# Imports

In [1]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

In [2]:
from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

TODO: replace the hand-picked stop-word list below with a more principled choice...

In [45]:
def stop (t) :
    """Return a truthy value when token `t` should be excluded from analysis.

    A token is excluded when its `is_stop` attribute is truthy, or when its
    `lemma_` is one of the hand-picked task words listed below.
    """
    flagged = t.is_stop
    if flagged :
        return flagged
    return t.lemma_ in ['person', 'tangrams', 'tangram', 'look', 'like', 'tap', 'choose', 'zzz', 'xxx', 'yyy', 'pick', 'guy', 'blue', 'box']

# pre-process text by lemmatizing

In [118]:
# Load the combined utterance table; the file is decoded as latin-1 rather than pandas' default encoding.
d_raw = pd.read_csv('../deidentified/combined.csv', encoding='latin-1')

In [119]:
# Run the spaCy pipeline over every utterance, then keep only the tokens
# that spaCy does not flag as stop words.
d_raw['text'] = list(map(nlp, d_raw['utterance']))
d_raw['non_stop_text'] = [
    list(filter(lambda token: not token.is_stop, doc))
    for doc in d_raw['text']
]

In [7]:
# Lemmatize the stop-word-filtered tokens of each utterance.
# Every later cell in this notebook reads the result from a column called
# 'lemmas', so store it under that name; without it those cells raise KeyError
# on a fresh kernel. 'lemmatized_nonstop' is kept as an alias for any code
# that still uses the old column name.
d_raw['lemmas'] = [lemmatize_doc(parsed_text) for parsed_text in d_raw['non_stop_text']]
d_raw['lemmatized_nonstop'] = d_raw['lemmas']

In [16]:
# Preview the first rows of the processed frame.
d_raw.head()

Unnamed: 0,subid,trial,person,role,target,rep_num,age,experiment,utterance,correct,text,non_stop_text,lemmas,contentful
0,4,1,child,matcher,K1,1,6.0,adult-child,zzz,True,(zzz),[zzz],[zzz],[]
1,4,1,parent,director,K1,1,6.0,adult-child,"OK, so you see the tangrams? Do you remember t...",True,"(OK, ,, so, you, see, the, tangrams, ?, Do, yo...","[OK, ,, tangrams, ?, remember, ?, OK, ,, looks...","[ok, tangrams, remember, ok, look, like, guy, ...","[tangrams, remember, little, hat, go]"
2,4,2,child,director,C1,1,6.0,adult-child,Mine looks like a guy bendi--mine looks kinda ...,True,"(Mine, looks, like, a, guy, bendi, --, mine, l...","[looks, like, guy, bendi, --, looks, kinda, li...","[look, like, guy, bendi, look, kinda, like, gu...",[bend]
3,4,2,parent,matcher,C1,1,6.0,adult-child,Oh I see,True,"(Oh, I, see)",[Oh],[oh],[]
4,4,3,parent,director,I1,1,6.0,adult-child,Mine looks like a guy carrying a tray saying w...,True,"(Mine, looks, like, a, guy, carrying, a, tray,...","[looks, like, guy, carrying, tray, saying, lik...","[look, like, guy, carry, tray, say, like]","[carry, tray, say]"


In [9]:
# Work on a copy so later cells do not modify d_raw.
d = d_raw.copy()
# Distinct game ids and tangram targets, in order of first appearance.
# Series.unique() replaces pd.unique(Series.ravel()): Series.ravel is
# deprecated in recent pandas releases and the result is the same.
gameidList = d['subid'].unique().tolist()
tangramList = d['target'].unique().tolist()

### Look at where conventions were introduced

The first-order case is where you just look at how often final words occurred at each previous round (percentage).

Then we can look at *first* round for each particular word... 

Then we can check who was speaker on that first round.

TODO: check cases where the 'matcher' on a round may have introduced a word

TODO: Split on the PMI distribution as a more systematic way of finding stop words...

TODO: look at P(r + 1 | r) for local 'stickiness'

TODO: potentially add back in messages sent after selection

In [124]:
rows = []

def stop (t) :
    """Return a truthy value when token `t` should be left out of the content words.

    This redefines the `stop` helper from earlier in the notebook; unlike that
    version, this list does not filter 'tangram' / 'tangrams'.
    """
    return t.is_stop or t.lemma_ in ['person', 'look', 'like', 'tap', 'choose', 'zzz', 'xxx', 'yyy', 'pick', 'guy', 'blue', 'box']

# Build the lookup of director lemmas once, keyed by (subid, rep_num, target).
# The previous version re-queried the whole frame for every word and every
# earlier round, compared `subid` against a quoted string (which matches
# nothing when the column is numeric), and flattened the lemma lists through
# np.array, which breaks when a target has several utterances of different
# lengths. Assumes each 'lemmas' entry is a list of strings.
director_df = d.query('role == "director"')
lemmas_by_round = {
    key: {lemma for lemmas in group['lemmas'] for lemma in lemmas}
    for key, group in director_df.groupby(['subid', 'rep_num', 'target'])
}

# For each game, look at referring expressions produced by the director on a later round
for (subid, _rep), later_df in director_df.groupby(['subid', 'rep_num']) :
    for _idx, row in later_df.sort_values('target').iterrows() :
        later_rep = row['rep_num']
        target = row['target']
        # Unique content-word lemmas of this utterance, in sorted order.
        content_words = sorted({
            t.lemma_ for t in row.text
            if t.pos_ in ["NOUN", "ADJ", 'VERB']
            and not stop(t)
        })
        for word in content_words :
            # Compare against the same target in every earlier round of the same game.
            for earlier_rep in range(1, int(later_rep)) :
                earlier_lemmas = lemmas_by_round.get((subid, earlier_rep, target), set())
                match = word in earlier_lemmas
                rows.append([row['experiment'], row['subid'], row['target'], row['person'], row['age'], earlier_rep, later_rep, word, match])



KeyboardInterrupt: 

In [15]:
# One record per (content word, earlier round) comparison collected above.
match_columns = [
    'experiment', 'subid', 'target', 'final_round_person', 'age',
    'earlier_rep', 'later_rep', 'word', 'match',
]
words_df = pd.DataFrame(rows, columns = match_columns)

In [16]:
# Write the match table to disk without the row index.
words_df.to_csv('../deidentified/word_matches.csv', index=False)

We can also look at the inverse: the probability that a word used on the current round also appears on the final round.

In [None]:
rows = []

# Repetition treated as the final round.
FINAL_REP = 4

# Build the final-round lemma lookup once, keyed by (subid, target).
# The previous version compared `subid` against a quoted string (which matches
# nothing when the column is numeric), flattened the lemma lists through
# np.array (which breaks when a target has several utterances of different
# lengths), re-ran the same query for every row, and printed debug output for
# every utterance. Assumes each 'lemmas' entry is a list of strings.
director_df = d.query('role == "director"')
final_round_df = director_df[director_df['rep_num'] == FINAL_REP]
final_lemmas = {
    key: {lemma for lemmas in group['lemmas'] for lemma in lemmas}
    for key, group in final_round_df.groupby(['subid', 'target'])
}

# For each game and round, look at referring expressions produced by the director
for (subid, _rep), rep_df in director_df.groupby(['subid', 'rep_num']) :
    # For each word used with each tangram, check whether it also occurred on the final round
    for _idx, row in rep_df.sort_values('target').iterrows() :
        target = row['target']
        # Repeated words are kept: each occurrence produces its own record.
        content_words = [t.lemma_ for t in row.text
                         if t.pos_ in ["NOUN", "ADJ", 'VERB']
                         and not stop(t)]
        target_final_lemmas = final_lemmas.get((subid, target), set())
        for word in content_words :
            final_match = word in target_final_lemmas
            rows.append([row['experiment'], row['subid'], row['rep_num'], row['target'], row['person'], row['age'], word, final_match])

In [None]:
# Tabulate the per-word final-round matches and write them to disk
# (the row index is written as the first CSV column).
inverse_columns = ['experiment', 'subid', 'rep_num', 'target', 'person', 'age', 'word', 'final_match']
words_df = pd.DataFrame(rows, columns = inverse_columns)
words_df.to_csv('../deidentified/inverse_word_matches.csv')

# Examine semantic embeddings
We'd like to pull out bag-of-words embeddings from the NPs in each utterance in the cued dataset and cluster them for each tangram. We expect to see different pairs in different parts of the space (i.e. to compute a d' for an 'idiosyncrasy' or 'multiple equilibria' result), and also to see different utterances from a single game closer together.

TODO: this is really aggressive... don't remove stop words?

In [140]:
# Load the recording table. The original 'utterance' column is set aside as
# 'garbage' and the 'record' column takes over the name 'utterance'; rows
# whose utterance is the literal "x" or is missing are then dropped.
d_raw = (
    pd.read_csv('../data/deidentified/tangram_recording.csv')
    .rename(columns={'utterance' : 'garbage', 'record' : 'utterance'})
    .query('utterance != "x"')
    .dropna(subset=['utterance'])
)

Unnamed: 0,subid,type,trial,target,person,garbage,selection,correct,utterance
0,4,test,8.0,A1,child,Mine looks like something walking away zzz,,Y,mine looks like something walking away
1,4,test,34.0,A1,parent,And now my person is just walking down the str...,,Y,And now my person is just walking down the str...
2,4,test,6.0,B1,child,My mine looks like a person that has their han...,,Y,mine looks like a person that has their hands ...
4,4,test,2.0,C1,child,Mine looks like a guy bendi--mine looks kinda ...,,Y,mine looks kinda like a guy bending down
5,4,test,36.0,C1,parent,"My person is diving into the pool, with their ...",,Y,"My person is diving into the pool, with their ..."
...,...,...,...,...,...,...,...,...,...
1346,74,test,8.0,D1,child,"oh it's on the ground, oh it's my turn, it's u...",,Y,it is a man on the ground and one foot is up
1354,74,test,1.0,H1,parent,"it looks like the person is pointing, um, or t...",,Y,it looks like the person is pointing
1355,74,test,39.0,H1,child,it's something that's look like it's giving to...,,Y,it looks like something giving you a coin
1369,75,test,33.0,D1,child,race- floating racecar,,Y,a floating racecar


In [159]:
# Parse each utterance with spaCy, then store the lemma of every token
# (no stop-word or part-of-speech filtering happens here).
d_raw['text'] = list(map(nlp, d_raw['utterance']))
d_raw['contentful'] = list(
    map(lambda doc: [tok.lemma_ for tok in doc], d_raw['text'])
)

In [160]:
# Display the frame with the parsed 'text' and 'contentful' columns added.
d_raw

Unnamed: 0,subid,type,trial,target,person,garbage,selection,correct,utterance,text,contentful
0,4,test,8.0,A1,child,Mine looks like something walking away zzz,,Y,mine looks like something walking away,"(mine, looks, like, something, walking, away)","[mine, look, like, something, walk, away]"
1,4,test,34.0,A1,parent,And now my person is just walking down the str...,,Y,And now my person is just walking down the str...,"(And, now, my, person, is, just, walking, down...","[and, now, -PRON-, person, be, just, walk, dow..."
2,4,test,6.0,B1,child,My mine looks like a person that has their han...,,Y,mine looks like a person that has their hands ...,"(mine, looks, like, a, person, that, has, thei...","[mine, look, like, a, person, that, have, -PRO..."
4,4,test,2.0,C1,child,Mine looks like a guy bendi--mine looks kinda ...,,Y,mine looks kinda like a guy bending down,"(mine, looks, kinda, like, a, guy, bending, down)","[mine, look, kinda, like, a, guy, bend, down]"
5,4,test,36.0,C1,parent,"My person is diving into the pool, with their ...",,Y,"My person is diving into the pool, with their ...","(My, person, is, diving, into, the, pool, ,, w...","[-PRON-, person, be, dive, into, the, pool, ,,..."
...,...,...,...,...,...,...,...,...,...,...,...
1346,74,test,8.0,D1,child,"oh it's on the ground, oh it's my turn, it's u...",,Y,it is a man on the ground and one foot is up,"(it, is, a, man, on, the, ground, and, one, fo...","[-PRON-, be, a, man, on, the, ground, and, one..."
1354,74,test,1.0,H1,parent,"it looks like the person is pointing, um, or t...",,Y,it looks like the person is pointing,"(it, looks, like, the, person, is, pointing)","[-PRON-, look, like, the, person, be, point]"
1355,74,test,39.0,H1,child,it's something that's look like it's giving to...,,Y,it looks like something giving you a coin,"(it, looks, like, something, giving, you, a, c...","[-PRON-, look, like, something, give, -PRON-, ..."
1369,75,test,33.0,D1,child,race- floating racecar,,Y,a floating racecar,"(a, floating, racecar)","[a, float, racecar]"


In [161]:
# Placeholder embedding (all NaN) for rows without any usable word vector.
null_embedding = np.full((1,300), np.nan)
def get_feats(d_in, nlp, scramble = False) :
    """Average word vectors over each row's 'contentful' tokens.

    Parameters
    ----------
    d_in : pandas.DataFrame
        Frame with a 'contentful' column holding, per row, a list of token
        strings. All rows are used; nothing is filtered by role.
    nlp : callable
        Called as `nlp(token)`; the result must expose `has_vector` and
        `vector`. The vector length must match `null_embedding` (300).
    scramble : bool
        When true, the copied frame is first passed through `scramble_words`,
        which is not defined in this cell and must already be in scope.

    Returns
    -------
    (d, raw_avg_feats)
        `d` is a copy of `d_in` (scrambled if requested). `raw_avg_feats` is
        a float array with one row per row of `d`; rows without any usable
        vector are all NaN.

    Notes
    -----
    A token is skipped when it has no vector or its vector is all zeros. The
    previous check, `sum(vector) > 0`, also threw away every word whose
    components happened to sum to zero or less.
    """
    d = d_in.copy()

    if scramble :
        d = scramble_words(d)

    dim = null_embedding.shape[1]
    row_embeddings = []
    for tokens in d['contentful'] :
        vectors = []
        for token in tokens :
            # Parse each token once; the parse is the expensive step.
            parsed = nlp(token)
            if not parsed.has_vector :
                continue
            vec = np.asarray(parsed.vector, dtype=np.float64)
            if np.isfinite(vec).all() and np.any(vec != 0) :
                vectors.append(vec)

        # Average the token vectors; rows with nothing usable get the NaN
        # placeholder so they can be recognised and dropped downstream.
        if vectors :
            row_embeddings.append(np.mean(vectors, axis = 0))
        else :
            row_embeddings.append(null_embedding[0].copy())

    # Stack once at the end instead of growing the matrix row by row.
    raw_avg_feats = np.array(row_embeddings, dtype=np.float64).reshape(len(row_embeddings), dim)
    return d, raw_avg_feats


In [162]:
# Compute one averaged embedding per row of d_raw; `meta` is the frame copy returned alongside the feature matrix.
meta, raw_avg_feats = get_feats(d_raw, nlp)



In [163]:
# Write the metadata (without the 'utterance' and 'contentful' columns) and
# the matching feature matrix to disk.
meta_to_save = meta.drop(columns = [ 'utterance', 'contentful'])
meta_to_save.to_csv('../data/deidentified/meta_tangrams_embeddings.csv')
np.save('../data/deidentified/feats_tangrams_embeddings_rawavg.npy', raw_avg_feats)

TODO: initial distribution w/in vs. across
TODO: 2D PCA... (traces of beginnings and ends)
-- Connect individuals in a game with a line!
-- Word clouds for initial and final

# Look at tsne visualization
TODO: there are a bunch of problems with this: a lot of the creative utterances don't exist in current embedding (e.g. "ghostman"), sometimes they don't converge to a noun (e.g. "flying"), etc.

In [148]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD

# Fixed seed so the stochastic reducers give the same layout on every run;
# without it the saved coordinates change each time the notebook is re-run.
RANDOM_STATE = 0

tsne = TSNE(n_components = 2, random_state = RANDOM_STATE)
# PCA down to 40 dimensions is applied before t-SNE; the 2-component PCA and
# MDS objects are created here for visualisation as well.
big_pca = PCA(n_components = 40, random_state = RANDOM_STATE)
viz_pca = PCA(n_components = 2, random_state = RANDOM_STATE)
mds = MDS(n_components=2, random_state = RANDOM_STATE)

In [152]:
# Columns that lead the output table (some are placeholders that this cell
# does not fill in).
viz_columns = ['subid', 'target', 'trial', 'rep_num', 'x_tsne', 'y_tsne', 'x_mds', 'y_mds', 'feats_type']

# Collect one frame per target and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic anyway.
per_target_frames = []

for _target, group in meta.reset_index(drop=True).groupby('target') :
    # After reset_index the labels are row positions, so they index raw_avg_feats directly.
    tangram_inds = np.array(group.index)
    relevant_feats = raw_avg_feats[tangram_inds]

    # You can't run tsne with NANs, so we have to take them out and then add them back in...
    nan_rows = [i for i in range(relevant_feats.shape[0]) if pd.isna(relevant_feats[i,0])]
    # np.insert positions refer to the compressed array, so shift each NaN
    # row back by the number of NaN rows that precede it.
    nan_insert_rows = [k - lag for (lag, k) in enumerate(nan_rows)]
    X = np.ma.masked_invalid(relevant_feats)
    tsne_out = tsne.fit_transform(big_pca.fit_transform(np.ma.compress_rows(X)))
    tsne_out = np.insert(tsne_out, nan_insert_rows, np.nan, axis=0)
    X_tsne = pd.DataFrame(tsne_out, columns = ['x_tsne', 'y_tsne'], index=tangram_inds)
    per_target_frames.append(pd.concat([group, X_tsne], axis = 1))

if per_target_frames :
    embedding_viz = pd.concat(per_target_frames, ignore_index=True, sort=False)
    # Keep the same column order as before: the template columns first,
    # then the remaining metadata columns.
    extra_columns = [c for c in embedding_viz.columns if c not in viz_columns]
    embedding_viz = embedding_viz.reindex(columns = viz_columns + extra_columns)
else :
    embedding_viz = pd.DataFrame(columns = viz_columns)


In [153]:
# Drop the 'text' and 'contentful' columns, then write the t-SNE coordinates with their metadata to disk.
embedding_viz.drop(columns=['text', 'contentful']).to_csv('../data/deidentified/tsne_embeddings.csv')