# Imports

In [1]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

In [2]:
from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

TODO: do something more principled here...

In [None]:
def stop (t) :
    return t.is_stop or t.lemma_ in ['person', 'tangrams', 'tangram', 'look', 'like', 'tap', 'choose', 'zzz', 'xxx', 'yyy', 'pick', 'guy', 'blue', 'box']

# pre-process text by lemmatizing

In [4]:
d_raw = pd.read_csv('../data/deidentified/combined_clean.csv', encoding='latin-1')

In [5]:
d_raw['text'] = [nlp(text) for text in d_raw['utterance']]
d_raw['non_stop_text'] = [[token for token in text if not token.is_stop] for text in d_raw['text']]

TypeError: object of type 'float' has no len()

In [None]:
d_raw['lemmatized_nonstop'] = [lemmatize_doc(parsed_text) for parsed_text in d_raw['non_stop_text']]

In [None]:
d_raw.head()

In [None]:
d = d_raw.copy()
gameidList = pd.unique(d.subid.ravel()).tolist()
tangramList = pd.unique(d.target.ravel()).tolist()

### Look at where conventions were introduced

The first-order case is where you just look at how often final words occurred at each previous round (percentage).

Then we can look at *first* round for each particular word... 

Then we can check who was speaker on that first round.

TODO: check cases where the 'matcher' on a round may have introduced a word

TODO: Split on PMI distribution as more systematic way of finding stop words... 

TODO: look at P(r + 1 | r) for local 'stickiness'

TODO: potentially add back in messages sent after selection

In [None]:
rows = []

def stop (t) :
    return t.is_stop or t.lemma_ in ['person', 'look', 'like', 'tap', 'choose', 'zzz', 'xxx', 'yyy', 'pick', 'guy', 'blue', 'box']

# For each game, look at referring expressions produced by director on later round
for name, df in d.query('role == "director"').groupby(['subid', 'rep_num']) :
    for i, row in df.sort_values('target').reset_index().iterrows() :
        later_rep = row['rep_num']
        target = row['target']
        content_words = np.unique(
            [t.lemma_ for t in row.text 
             if t.pos_ in ["NOUN", "ADJ", 'VERB'] 
             and not stop(t)]
        )
        query_str = 'target == "{}"'.format(target)
        for j, word in enumerate(content_words) :
            for earlier_rep in range(1, later_rep) :
                earlier_df = d.query('rep_num == {} and role == "director" and subid == "{}"'
                                     .format(earlier_rep, name[0])).sort_values('target').reset_index()
                match = word in np.array(list(earlier_df.query(query_str)['lemmas'])).flatten()
                rows.append([row['experiment'], row['subid'], row['target'], row['person'], row['age'], earlier_rep, later_rep, word, match])

In [None]:
words_df = pd.DataFrame(rows,
    columns = ['experiment', 'subid', 'target', 'final_round_person', 'age',
               'earlier_rep', 'later_rep',  'word', 'match']
)

In [None]:
words_df.to_csv('../deidentified/word_matches.csv', index=False)

We can also look at the inverse: probability of words on current round appearing at end... 

In [None]:
rows = []

# For each game, look at referring expressions produced by director on final round
for name, rep_df in d.query('role == "director"').groupby(['subid', 'rep_num']) :
    rep_df = rep_df.sort_values('target').reset_index()
    final_df = d.query('rep_num == 4 and role == "director" and subid == "{}"'.format(name[0])).sort_values('target').reset_index()
    
    # For each word used with each tangram, check whether it occured in each earlier round
    for i, row in rep_df.iterrows() :
        target = row['target']
        content_words = [t.lemma_ for t in row.text 
                         if t.pos_ in ["NOUN", "ADJ", 'VERB'] 
                         and not stop(t)]
        print('content', content_words)
        query_str = 'target == "{}"'.format(target)
        print(np.array(list(final_df.query(query_str)['lemmas'])).flatten())
        for j, word in enumerate(content_words) :
            final_match = word in np.array(list(final_df.query(query_str)['lemmas'])).flatten()
            rows.append([row['experiment'], row['subid'], row['rep_num'], row['target'], row['person'], row['age'], word, final_match])

In [None]:
words_df = pd.DataFrame(rows,
    columns = ['experiment', 'subid', 'rep_num', 'target', 'person', 'age', 'word', 'final_match']
)
words_df.to_csv('../deidentified/inverse_word_matches.csv')

# Examine semantic embeddings
We'd like to pull out bag of words embeddings from NPs in each utterance in the cued dataset and cluster them for each tangram; expect to see different pairs in different parts of the space (i.e. to compute a d' for an 'idiosyncracy' or 'multiple equilibria' result) and also different utterances from single games closer together. 

In [30]:
pd.read_csv('../data/deidentified/combined_clean.csv', encoding='latin-1')

Unnamed: 0,subid,trial,person,role,target,rep_num,age,experiment,utterance,correct,director
0,4,1,child,matcher,K1,1,6,adult-child,,True,parent
1,4,1,parent,director,K1,1,6,adult-child,so mine looks like a guy with a little hat and...,True,parent
2,4,2,child,director,C1,1,6,adult-child,Mine looks like a guy mine looks kinda like a ...,True,child
3,4,2,parent,matcher,C1,1,6,adult-child,,True,child
4,4,3,parent,director,I1,1,6,adult-child,Mine looks like a guy carrying a tray saying w...,True,parent
...,...,...,...,...,...,...,...,...,...,...,...
4124,119,36,right,director,G1,4,adult,adult-adult,it's not the person jumping,True,parent
4125,119,37,left,director,E1,4,adult,adult-adult,it's the person jumping,True,parent
4126,119,38,right,director,C1,4,adult,adult-adult,it's the person kneeling all the way down,True,parent
4127,119,39,left,director,D1,4,adult,adult-adult,it's the person in the airplane,True,parent


In [31]:
d_raw = (pd.read_csv('../data/deidentified/combined_clean.csv', encoding='latin-1')
         .query('utterance != "x"')
         .reset_index())


In [34]:
d_raw = d_raw[pd.notnull(d_raw.utterance)]

In [36]:
# concatenate
d_raw.groupby(['subid','trial', 'target', 'age', 'rep_num', 'experiment', 'director'])['utterance'].agg(utterance=' '.join).reset_index()

Unnamed: 0,subid,trial,target,age,rep_num,experiment,director,utterance
0,4,1,K1,6,1,adult-child,parent,so mine looks like a guy with a little hat and...
1,4,2,C1,6,1,adult-child,child,Mine looks like a guy mine looks kinda like a ...
2,4,3,I1,6,1,adult-child,parent,Mine looks like a guy carrying a tray saying w...
3,4,4,E1,6,1,adult-child,child,Mine looks like a person jumping for joy
4,4,5,H1,6,1,adult-child,parent,Mine looks like someone's hungry and is like c...
...,...,...,...,...,...,...,...,...
3252,119,36,G1,adult,4,adult-adult,parent,it's not the person jumping
3253,119,37,E1,adult,4,adult-adult,parent,it's the person jumping
3254,119,38,C1,adult,4,adult-adult,parent,it's the person kneeling all the way down
3255,119,39,D1,adult,4,adult-adult,parent,it's the person in the airplane


In [37]:
d_raw['text'] = [nlp(text) for text in d_raw['utterance']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
d_raw['contentful'] = [[t.lemma_ for t in text 
                        if t.is_alpha and 
                        t.pos_ not in ['PRON', 'DET', 'CCONJ', 'ADP', 'AUX', 'PUNCT']] 
                       for text in d_raw['text']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [39]:
null_embedding = np.full((1,300), np.nan)
def get_feats(d_in, nlp, scramble = False) :
    # only look at director utterances
    d = d_in.copy()

    # initialize feature vector
    raw_avg_feats = np.array([]).reshape(0, 300)

    if scramble :
        d = scramble_words(d)
        
    for i, row in d.iterrows() :
        local_embedding = np.array([]).reshape(0, 300)
        for token in row['contentful'] :
            if nlp(token).has_vector and sum(nlp(token).vector) != 0:
                local_embedding = np.vstack((local_embedding, nlp(token).vector))
            else :
                print(i, '/', d.shape[0])
                print(row['utterance'])
                print(row['contentful'])
                print('no vector available:', nlp(token))

        # average them together, handling empty lists
        if row['contentful'] :
            raw_avg_embedding = np.nanmean(local_embedding, axis = 0) 
        else :
            
            raw_avg_embedding = null_embedding.copy()
            row['is_null'] = True
            
        # add to overall list
        raw_avg_feats = np.vstack((raw_avg_feats, raw_avg_embedding))
    return d, raw_avg_feats


In [40]:
meta, raw_avg_feats = get_feats(d_raw, nlp)

119 / 3728
I see a guy he's holding his hands that way this is the last one Like he's pointing his hands on the right I think he's like holding a gondora he's holding the square going that way Like he's not holding it in his hands like he's not holding anything he's not holding anything over there and he's not over there and he's not doing that he's over there It looks like a guy who has a square on his head and it looks like a square down pointing that way
['see', 'guy', 'hold', 'hand', 'way', 'last', 'one', 'like', 'point', 'hand', 'right', 'think', 'like', 'hold', 'gondora', 'hold', 'square', 'go', 'way', 'like', 'not', 'hold', 'hand', 'like', 'not', 'hold', 'not', 'hold', 'over', 'there', 'not', 'over', 'there', 'not', 'do', 'that', 'over', 'there', 'look', 'like', 'guy', 'square', 'head', 'look', 'like', 'square', 'point', 'way']
no vector available: gondora
1085 / 3728
this lady looks like she is wasasa, i'm just playing, it looks like she's wasasawing, it means laying down in my



1626 / 3728
It's a shape Someonerunning on their tippy toes Two feet With their knees they're running with their knees both feet back
['shape', 'someonerunne', 'tippy', 'toe', 'two', 'foot', 'knee', 'run', 'knee', 'foot', 'back']
no vector available: someonerunne
1657 / 3728
OK this is a person leaning over with his toushie out and a tray in his hand and he's wearing a hat
['ok', 'person', 'lean', 'toushie', 'tray', 'hand', 'wear', 'hat']
no vector available: toushie
2066 / 3728
this one is with somebodysomebody that has a hind legand their hand is out
['one', 'somebodysomebody', 'hind', 'legand', 'hand', 'out']
no vector available: somebodysomebody
2105 / 3728
This one is somebodythat's putting out their arm
['one', 'somebodythat', 'put', 'arm']
no vector available: somebodythat
2122 / 3728
i have two triangles on there and those things and i have a big triangle right there that doesn't have that line and i have a square on top that looks like this that these are legs and this is a he

In [41]:
meta

Unnamed: 0,index,subid,trial,person,role,target,rep_num,age,experiment,utterance,correct,director,text,contentful
1,1,4,1,parent,director,K1,1,6,adult-child,so mine looks like a guy with a little hat and...,True,parent,"(so, mine, looks, like, a, guy, with, a, littl...","[so, mine, look, like, guy, little, hat, go]"
2,2,4,2,child,director,C1,1,6,adult-child,Mine looks like a guy mine looks kinda like a ...,True,child,"(Mine, looks, like, a, guy, mine, looks, kinda...","[Mine, look, like, guy, look, kinda, like, guy..."
4,4,4,3,parent,director,I1,1,6,adult-child,Mine looks like a guy carrying a tray saying w...,True,parent,"(Mine, looks, like, a, guy, carrying, a, tray,...","[Mine, look, like, guy, carry, tray, say, like]"
5,5,4,4,child,director,E1,1,6,adult-child,Mine looks like a person jumping for joy,True,child,"(Mine, looks, like, a, person, jumping, for, joy)","[Mine, look, like, person, jump, joy]"
6,6,4,5,parent,director,H1,1,6,adult-child,Mine looks like someone's hungry and is like c...,True,parent,"(Mine, looks, like, someone, 's, hungry, and, ...","[Mine, look, like, hungry, like, please, more]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4124,4124,119,36,right,director,G1,4,adult,adult-adult,it's not the person jumping,True,parent,"(it, 's, not, the, person, jumping)","[not, person, jump]"
4125,4125,119,37,left,director,E1,4,adult,adult-adult,it's the person jumping,True,parent,"(it, 's, the, person, jumping)","[person, jump]"
4126,4126,119,38,right,director,C1,4,adult,adult-adult,it's the person kneeling all the way down,True,parent,"(it, 's, the, person, kneeling, all, the, way,...","[person, kneel, way, down]"
4127,4127,119,39,left,director,D1,4,adult,adult-adult,it's the person in the airplane,True,parent,"(it, 's, the, person, in, the, airplane)","[person, airplane]"


In [42]:
(meta
 .drop(columns = [ 'utterance', 'contentful'])
 .to_csv('../data/deidentified/meta_tangrams_embeddings.csv'))
np.save('../data/deidentified/feats_tangrams_embeddings_rawavg.npy', raw_avg_feats)

# Look at tsne visualization


In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
tsne = TSNE(n_components = 2)
big_pca = PCA(n_components = 40)
viz_pca = PCA(n_components = 2)
mds = MDS(n_components=2)

In [None]:
embedding_viz = pd.DataFrame(
    columns = ['subid', 'target', 'trial', 'rep_num', 'x_tsne', 'y_tsne', 'x_mds', 'y_mds', 'feats_type']
)

for name, group in meta.reset_index(drop=True).groupby('target') :
    tangram_inds = np.array(group.index)
    feats = raw_avg_feats
    relevant_feats = feats[tangram_inds]
    
    # You can't run tsne with NANs, so we have to take them out and then add them back in...
    nan_rows = [i for i in range(relevant_feats.shape[0]) if pd.isna(relevant_feats[i,0])]
    nan_insert_rows = [k - lag for (lag, k) in enumerate(nan_rows)]
    X = np.ma.masked_invalid(relevant_feats)
    tsne_out = tsne.fit_transform(big_pca.fit_transform(np.ma.compress_rows(X)))
    tsne_out = np.insert(tsne_out, nan_insert_rows, np.nan, axis=0)
    X_tsne = pd.DataFrame(tsne_out, columns = ['x_tsne', 'y_tsne'], index=tangram_inds) 
    embedding_viz = embedding_viz.append(pd.concat([group, X_tsne], axis = 1), 
                                         ignore_index=True, sort=False)


In [None]:
embedding_viz.drop(columns=['text', 'contentful']).to_csv('../data/deidentified/tsne_embeddings.csv')