In [None]:
import numpy as np
import pandas as pd
from __future__ import division
from random import shuffle
import xmltodict
import json
import collections
from collections import Counter

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# view all columns of pandas df
pd.set_option('display.max_columns', None)

# nltk.download('wordnet')      # download wordnet if it's not already downloaded

with open ("dreambank-public.xml") as f:
    doc = xmltodict.parse(f.read())

def convert(data):
    if isinstance(data, basestring):
        return str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convert, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convert, data))
    else:
        return data

def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]

def mid(s, offset, amount):
    return s[offset:offset+amount]

# Exploratory Analysis

First we want to print the data to see which fields we are given and how the data looks.

In [None]:
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' 
    print '  ID: ' + dreamer['id']
    print '  type: ' + dreamer['type']
    print '  sex: ' + dreamer['sex']
    print '  age: ' + dreamer['age']
    
    try:
        print '  time: ' + dreamer['time']
    except:
        pass
    
    print '  sample dream: ' 
    
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print '    report: ' + left(convert(value), 200) + '...'
        else:
            print '    ' + convert(key) + ': ' + str(convert(value))
        
    print '\n'

In [None]:
print '---Dream collections from individuals---' + '\n'
MultIDs = ['b', 'madeline1-hs', 'madeline2-dorms', 'madeline3-offcampus', 'phil1', 'phil2', 'vietnam_vet']
NumberOfSeries = 1

for dreamer in doc['dreambank']['collection']:
    if dreamer['type'] == 'series':
        print '{' + dreamer['id']  + '} ' + dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'    
        dreamer['w266ID'] = NumberOfSeries
        
        # Assign a dreamer ID that groups the same dreamers together 
        # and skips the dream collections of multiple dreamers
        print 'w266 ID: ' + str(dreamer['w266ID'])
        if dreamer['id'] not in MultIDs:
            print '\n'
            NumberOfSeries += 1
    else:
        dreamer['w266ID'] = 0
        
print "Total Number of individuals to test vs. 'others': " + str(NumberOfSeries - 1)

In [None]:
DreamNum = 0

for dreamer in doc['dreambank']['collection']:
    for odict in dreamer['dream']:
        for key, value in odict.items():            
            if convert(key) == 'report':
                DreamNum += 1

print "Total Dreams: " + str(DreamNum)

# EDA - Reduce To Noun-Only And Lemmatize

For our topic modeling, we will want to lemmatize the corpus and reduce to nouns-only. However, before we get to topic modeling, it will be helpful to test out the lemmatization and noun-only reduction techniques.

In [None]:
# reduce to noun-only and lemmatize

lmtzr = WordNetLemmatizer()

for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'
    
    print '  noun-only sample dream: ' 
    
    odict = dreamer['dream'][0]
    
    # (1) tokenize dream text
    # (2) tag with PoS
    # (3) reduce to noun-only
    # (4) lemmatize nouns
    # (5) get frequency counts of the lemmatized nouns
    for key, value in odict.items():
        if convert(key) == 'report':
            text = nltk.tokenize.word_tokenize(value)
            pos = nltk.pos_tag(text)
            noun_only = [w[0] for w in pos if w[1].startswith('N')]
            lmtz_noun_only = [lmtzr.lemmatize(word) for word in noun_only]
            counts = Counter(lmtz_noun_only)
            print counts
        
    print '\n'

# Create Dataframe

In [None]:
ID = 0
W266ID = 0
DreamBankID = ''
DreamNumber = ''
Name = ''
Sex = ''
Dream = ''
HasDream = 0

df = pd.DataFrame(columns=["ID", "W266ID", "DreamBankID", "DreamNumber", "Name", "Sex", "Dream"])

for dreamer in doc['dreambank']['collection']:
    Name = dreamer['name'] 
    DreamBankID = dreamer['id']
    W266ID = int(dreamer['w266ID'])
    Sex = dreamer['sex']
    
    for odict in dreamer['dream']:
        HasDream = 0
        for key, value in odict.items():
            if convert(key) == 'report':
                Dream = convert(value)
                HasDream = 1
            if convert(key) == 'number':
                DreamNumber = convert(value)
        
        if HasDream == 1:
            ID += 1
            df = df.append({
                "ID": ID,
                "W266ID": W266ID,
                "DreamBankID": DreamBankID,
                "DreamNumber": DreamNumber,
                "Name": Name,
                "Sex": Sex,
                "Dream": Dream
                }, ignore_index=True)        
#     print '\n'

# print df
print "Total Dreams: " + str(ID)
print "\n"

df.head()

In [None]:
print 'Males: ' + str(len(df[df['Sex']=='M']))
print 'Females: ' + str(len(df[df['Sex']=='F']))
print 'Total: ' + str(len(df[df['Sex']=='M']) + len(df[df['Sex']=='F']))

# Modeling

We want to create a different model for each dreamer (i.e. a classifier identifying one vs all-others for each dreamer). This will allow us to identify the most predictive words in identifying each dreamer.

In [None]:
# randomly shuffle dataframe
# set seed for consistency while running
np.random.seed(0)
df = df.sample(frac=1).reset_index(drop=True)
    
# add 41 flag columns denoting dreamers (one col per dreamer) 
dreamer_flag = pd.get_dummies(df['W266ID'], prefix='Dreamer')
df = pd.concat([df, dreamer_flag], axis=1)

# create vocab from all dreams
dreams_flat = df['Dream'].values.flatten().tolist()
dreams_list = " ".join(dreams_flat)
vocab = list(set(nltk.tokenize.word_tokenize(dreams_list)))

def split_data(df, W266ID, train=0.6):        
    # column for "our" dreamer
    dreamer_label = 'Dreamer_' + str(W266ID)

    # make 60/40 split of train/test
    # test will be evenly split between dev and test in the next step
    num_train = int(len(df) * train)
    num_test = int(len(df) * (1-train)) 

    train_data, train_labels = df['Dream'][:num_train], df[dreamer_label][:num_train]
    dev_data, dev_labels = df['Dream'][-num_test : -num_test//2], df[dreamer_label][-num_test : -num_test//2] 
    test_data, test_labels = df['Dream'][-num_test//2:], df[dreamer_label][-num_test//2:]

    return train_data, train_labels, dev_data, dev_labels, test_data, test_labels

In [None]:
# split data for dreamer1 - Alta
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

In [None]:
# split data for dreamer2 - Angie
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=2)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

In [None]:
# split data for dreamer3 - Arlie
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=3)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

In [None]:
# # split data for dreamer1 - Alta
# train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1)
    
# # Create a Bag-of-Words Vectorizer
# vec = CountVectorizer(vocabulary=vocab)
# vec_train_data = vec.fit_transform(train_data)
# vec_dev_data = vec.transform(dev_data)  

# log = LogisticRegression(C = 100)
# log.fit(vec_train_data, train_labels)

# test_df = pd.DataFrame('Labels': [dev_labels], 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data))
# data = {'Labels': dev_labels, 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data)}
# test_df = pd.DataFrame(data)
# test_df.head(30)

# print confusion_matrix(dev_labels, log.predict(vec_dev_data))

## Run a model for each dreamer

We need to run a separate model for each dreamer. The models will predict if a dream comes from that dreamer or from "all-others". Since bag-of-words was working best in the above models, we will continue to use that for our baseline.

In [None]:
# Run Logisitic Regression for each dreamer

models = {}

for i in range(0,41):
    # split data
    train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=i)

    # Create a Bag-of-Words Vectorizer
    vec = CountVectorizer(vocabulary=vocab)
    vec_train_data = vec.fit_transform(train_data)
    vec_dev_data = vec.transform(dev_data)  

    ## Logistic reg
    log = LogisticRegression(C = 100)
    log.fit(vec_bow_train_data, train_labels)

    # score model
    f1_score = metrics.f1_score(dev_labels, log.predict(vec_dev_data), average='macro')
    
    # find the most-predictive features
    ### we're not using the weights currently, but might be useful/interesting later ###    
    best_feature_positions = log.coef_.argsort()[0][-5::]
    best_feature_weights = log.coef_[0][best_feature_positions.astype(int)]

    # get word labels for our features
    words = []
    for ft in best_feature_positions.astype(int):
        words.append(vec.get_feature_names()[ft])
    
    models[i] = (log, f1_score, words)

## Most Predictive Words

Now that we have a separate model for each dreamer, we can pull out the most predictive words from each model. This shows, for a given dreamer, which words are most predictive of their dreams as opposed to someone else's dream.

In [None]:
for key, (model, score, predictive_words) in models.iteritems():
    print 'W266ID='+str(key), '\tMost Predictive', predictive_words

## Predict Each Dreamer

We can use our models to predict which person a given dream came from. If we take a dream from our test set and run all 41 models on that dream, we will get probabilities of the dream coming from that person. We can then take the highest probability and make that our prediction for who the dream came from. 

In [None]:
# predict dreamer for each test dream
vec_test_data = vec.transform(test_data)

preds = []
# for dream in range(len(test_labels)):
for dream in range(len(test_labels)):
    highest_prob = 0
    
    # predicted probability of the correct label for each model
    for key, (model, score, w) in models.iteritems():
        prob_correct =  model.predict_proba(vec_test_data[dream])[0][1]
        if prob_correct > highest_prob:
            highest_prob = prob_correct
            pred = key
    
    preds.append(pred)
    
print sum(preds == df['W266ID'][20800:]) / len(preds)

We got an 82% success rate on our test data.

In [None]:
df[df["W266ID"]!=0][['W266ID', 'Name']].sort_values(by= ["W266ID"]).drop_duplicates()

In [70]:
dreams = list(df['Dream'])

# Split the documents into tokens.
for idx in range(len(dreams)):
    dreams[idx] = dreams[idx].lower()  # Convert to lowercase.
    dreams[idx] = nltk.tokenize.word_tokenize(dreams[idx])  # Split into words.
    dreams[idx] = nltk.pos_tag(dreams[idx])  # tag with PoS
    dreams[idx] = [token for token, tag in dreams[idx] if tag.startswith('N')]   # only keep nouns 
    
# Remove numbers, but not words that contain numbers.
dreams = [[token for token in dream if not token.isdigit()] for dream in dreams]

# Remove words that are only one or two characters.
dreams = [[token for token in dream if len(token) > 2] for dream in dreams]

In [72]:
# Lemmatize the dreams.
lmtzr = WordNetLemmatizer()
dreams = [[lmtzr.lemmatize(token) for token in dream] for dream in dreams]

In [None]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the dreams.
dictionary = Dictionary(dreams)

# Filter out words that occur less than 10 dreams, or more than 60% of the dreams.
dictionary.filter_extremes(no_below=10, no_above=0.6)

# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for dream in dreams]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of dreams: %d' % len(corpus))

__BELOW IS FROM: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html__

__HELPS EXPLAIN THE PARAMETERS__

## Training

*We are ready to train the LDA model. We will first discuss how to set some of the training parameters.*

*First of all, the elephant in the room: how many topics do I need? There is really no easy answer for this, it will depend on both your data and your application. I have used 10 topics here because I wanted to have a few topics that I could interpret and "label", and because that turned out to give me reasonably good results. You might not need to interpret all your topics, so you could use a large number of topics, for example 100.*

*The `chunksize` controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. I've set chunksize = 2000, which is more than the amount of documents, so I process all the data in one go. Chunksize can however influence the quality of the model, as discussed in Hoffman and co-authors [2], but the difference was not substantial in this case.*

*`passes` controls how often we train the model on the entire corpus. Another word for passes might be "epochs". iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.*

*I suggest the following way to choose iterations and passes. First, enable logging (as described in many Gensim tutorials), and set eval_every = 1 in LdaModel. When training the model look for a line in the log that looks something like this: *

    `2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations`

*If you set `passes` = 20 you will see this line 20 times. Make sure that by the final passes, most of the documents have converged. So you want to choose both passes and iterations to be high enough for this to happen.*

*We set `alpha = 'auto'` and `eta = 'auto'`. Again this is somewhat technical, but essentially we are automatically learning two parameters in the model that we usually would have to specify explicitly. *

In [105]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 20000
passes = 50
iterations = 500
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                        eta=0.1, iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

Wall time: 8min 46s


In [106]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
# print('Average topic coherence: %.4f.' % avg_topic_coherence)

# from pprint import pprint
# pprint(top_topics)

ZeroDivisionError: float division by zero

In [107]:
# shape: num_topics x vocabulary_size
# print model.get_topics()

model.print_topics()

[(0,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"pace" + 0.000*"rope" + 0.000*"ross"'),
 (1,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"pace" + 0.000*"rope" + 0.000*"ross"'),
 (2,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"pace" + 0.000*"rope" + 0.000*"ross"'),
 (3,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"pace" + 0.000*"rope" + 0.000*"ross"'),
 (4,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"pace" + 0.000*"rope" + 0.000*"ross"'),
 (5,
  u'0.000*"crutch" + 0.000*"governor" + 0.000*"fun" + 0.000*"error" + 0.000*"behavior" + 0.000*"match" + 0.000*"bikini" + 0.000*"p