# Imports

In [None]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

In [None]:
from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

# pre-process text by lemmatizing

In [None]:
# Load the combined, de-identified utterance table (the file is latin-1 encoded).
d_raw = pd.read_csv(
    '../deidentified/combined.csv',
    encoding='latin-1',
)

In [None]:
# Parse every utterance with spaCy and keep only the non-stop-word tokens.
# nlp.pipe processes the texts in batches and yields the docs in input order,
# which is considerably faster than calling nlp() once per row.
d_raw['text'] = [
    [token for token in doc if not token.is_stop]
    for doc in nlp.pipe(d_raw['utterance'])
]

In [None]:
# Run the project's lemmatize_doc helper over each row's filtered token list.
d_raw['lemmas'] = list(map(lemmatize_doc, d_raw['text']))

In [None]:
# Preview the first rows, including the 'text' and 'lemmas' columns added above.
d_raw.head()

In [None]:
# Work on a copy so the pre-processed frame `d_raw` stays untouched.
d = d_raw.copy()

# Distinct game ids and targets, in order of first appearance.
# Series.unique() replaces pd.unique(series.ravel()): Series.ravel is
# deprecated in recent pandas and the result is the same.
gameidList = d['subid'].unique().tolist()
tangramList = d['target'].unique().tolist()

### Look at where conventions were introduced

The first-order case is where you just look at how often final words occurred at each previous round (percentage).

Then we can look at the *first* round in which each particular word appeared...

Then we can check who was speaker on that first round.

TODO: check cases where the 'matcher' on a round may have introduced a word

In [None]:
# Build one record per (game, target, final-round word) saying whether the
# director already used that word for the same target in rounds 1, 2 and 3.
# Record layout: [subid, target, person, age, word, 1_match, 2_match, 3_match]
rows = []
earlier_rounds = (1, 2, 3)

# For each game, look at referring expressions produced by director on final round
for name, game_df in d[d['role'] == 'director'].groupby('subid'):
    # For every earlier round, map target -> set of all lemmas the director
    # used for it. Collecting into a set (instead of np.array(...).flatten())
    # stays correct when a target has several utterances of different lengths
    # in one round, and is computed once per game rather than once per word.
    # Assumes every 'lemmas' entry is an iterable of strings -- TODO confirm
    # against lemmatize_doc.
    lemmas_by_round = {
        rep_num: {
            target: {lemma for lemmas in target_df['lemmas'] for lemma in lemmas}
            for target, target_df in game_df[game_df['rep_num'] == rep_num].groupby('target')
        }
        for rep_num in earlier_rounds
    }

    final_df = game_df[game_df['rep_num'] == 4].sort_values('target')

    # For each word used with each tangram, check whether it occurred in each earlier round
    for _, row in final_df.iterrows():
        target = row['target']
        for word in np.unique(row['lemmas']):
            # A target with no utterance in a round simply yields no match.
            matches = [
                word in lemmas_by_round[rep_num].get(target, ())
                for rep_num in earlier_rounds
            ]
            rows.append([row['subid'], target, row['person'], row['age'], word, *matches])

In [None]:
# Turn the collected match records into a table; the column order mirrors
# the order in which each record was assembled in the loop above.
word_match_columns = [
    'subid', 'target', 'final_round_person', 'age',
    'word', '1_match', '2_match', '3_match',
]
words_df = pd.DataFrame(data=rows, columns=word_match_columns)

In [None]:
# Persist the word-match table; with pandas' defaults the row index is written as the first, unnamed column.
words_df.to_csv('../deidentified/word_matches.csv')

### create tf-idf weightings

In [None]:
# Number of distinct values in the `target` column (one per tangram).
len(tangramList)

In [None]:
# For every game and every earlier round (1-3), score each vocabulary word of
# that round against each director utterance with tf-idf, and record whether
# the word is also used for the same target on the final round (4).
# Record layout: [subid, target, rep_num, person, age, word, tfidf, final_match]
rows = []

# For each game, look at referring expressions produced by director
for name, game_df in d.query('role == "director"').groupby('subid'):
    # target -> set of lemmas used for it on the final round. A set handles
    # several final-round utterances of different lengths for one target,
    # which np.array(...).flatten() does not.
    # Assumes every 'lemmas' entry is an iterable of strings -- TODO confirm.
    final_lemmas = {
        target: {lemma for lemmas in target_df['lemmas'] for lemma in lemmas}
        for target, target_df in game_df.query('rep_num == 4').groupby('target')
    }

    for rep_num in range(1, 4):
        earlier_df = game_df.query('rep_num == {}'.format(rep_num)).sort_values('target')

        # Drop unusable documents from the frame itself (not only from the
        # corpus) so that row i of docs_vecs always belongs to row i of earlier_df.
        usable = np.asarray(
            [not np.any(pd.isna(doc)) for doc in earlier_df['lemmas']], dtype=bool
        )
        earlier_df = earlier_df.loc[usable].reset_index(drop=True)
        if earlier_df.empty:
            # No director utterances for this round in this game; np.vstack
            # below would fail on an empty corpus.
            continue

        docs_dict = Dictionary(earlier_df['lemmas'])
        docs_corpus = [docs_dict.doc2bow(doc) for doc in earlier_df['lemmas']]
        model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict, smartirs='btn')  # note: 'btn' uses boolean for tf term
        docs_tfidf = model_tfidf[docs_corpus]
        docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])

        # The vocabulary is the same for every document in this round, so
        # resolve token ids to words once instead of once per row.
        words = [docs_dict[token_id] for token_id in range(docs_vecs.shape[1])]

        for i, row in earlier_df.iterrows():
            target_final_lemmas = final_lemmas.get(row['target'], ())
            for j, word in enumerate(words):
                tfidf = docs_vecs[i, j]
                final_match = word in target_final_lemmas
                rows.append([row['subid'], row['target'], row['rep_num'], row['person'], row['age'], word, tfidf, final_match])

In [None]:
# Show only the first few records; displaying the full list floods the
# cell output and bloats the saved notebook.
rows[:5]

In [None]:
# Turn the collected tf-idf records into a table; the column order mirrors
# the order in which each record was assembled in the loop above.
informativity_columns = [
    'subid', 'target', 'rep_num', 'person',
    'age', 'word', 'tfidf', 'final_match',
]
informativity_df = pd.DataFrame(data=rows, columns=informativity_columns)

In [None]:
# Persist the tf-idf table; with pandas' defaults the row index is written as the first, unnamed column.
informativity_df.to_csv('../deidentified/informativity.csv')

# Examine semantic embeddings
We'd like to pull out bag-of-words embeddings from the NPs in each utterance in the cued dataset and cluster them for each tangram. We expect to see different pairs in different parts of the space (i.e. to compute a d' for an 'idiosyncrasy' or 'multiple equilibria' result) and also different utterances from a single game closer together.

In [None]:
# NOTE(review): an earlier cell imports get_feats from `nlp_utils`, while this
# cell imports it from `utils.nlp_utils` -- confirm which module path is intended.
from utils.nlp_utils import get_feats
# NOTE(review): `docs_emb` is not assigned in any cell visible here, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere -- verify.
# get_feats returns three objects: a metadata table plus raw-average and
# weighted feature matrices (exact shapes are not visible from this notebook).
meta, raw_avg_feats, weighted_feats = get_feats(d, docs_emb, nlp)

In [None]:
# Sanity check: each feature matrix must have exactly one row per metadata row.
n_meta_rows = meta.shape[0]
assert weighted_feats.shape[0] == n_meta_rows
assert raw_avg_feats.shape[0] == n_meta_rows

In [None]:
# Write the metadata table as CSV and both feature matrices as .npy files.
meta.to_csv(path_or_buf='outputs/meta_tangrams_embeddings.csv')
np.save(file='outputs/feats_tangrams_embeddings_tfidf.npy', arr=weighted_feats)
np.save(file='outputs/feats_tangrams_embeddings_rawavg.npy', arr=raw_avg_feats)

TODO: initial distribution w/in vs. across
TODO: 2D PCA... (traces of beginnings and ends)
-- Connect individuals in a game with a line!
-- Word clouds for initial and final

# Look at tsne visualization
TODO: there are a bunch of problems with this: a lot of the creative utterances don't exist in current embedding (e.g. "ghostman"), sometimes they don't converge to a noun (e.g. "flying"), etc.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
# Fix the seed of every stochastic reducer so that Restart & Run All
# reproduces the same coordinates.
RANDOM_STATE = 0

tsne = TSNE(n_components=2, random_state=RANDOM_STATE)
# 50-dimensional PCA applied before t-SNE in the visualization cell below.
big_pca = PCA(n_components=50, random_state=RANDOM_STATE)
viz_pca = PCA(n_components=2, random_state=RANDOM_STATE)
mds = MDS(n_components=2, random_state=RANDOM_STATE)

In [None]:
# For each tangram and each feature type, reduce the utterance embeddings
# with PCA followed by t-SNE and attach the 2-D coordinates to the metadata.
# Pieces are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and grew the frame quadratically.
embedding_viz_columns = ['gameid', 'intendedName', 'repetitionNum', 'x_tsne', 'y_tsne', 'x_mds', 'y_mds', 'feats_type']
viz_pieces = []

for name, group in meta.groupby('intendedName'):
    # Index labels of this tangram's rows, used as positions into the feature
    # matrices -- assumes `meta` has a default 0..n-1 index; TODO confirm.
    tangram_inds = np.array(group.index)
    for feats_type in ['raw_avg', 'weighted']:
        feats = weighted_feats if feats_type == 'weighted' else raw_avg_feats
        relevant_feats = feats[tangram_inds]

        # Only rows that are finite in every dimension can be projected.
        # Checking all columns (not just the first) keeps the row count
        # consistent with what is actually passed to PCA.
        valid_rows = np.isfinite(relevant_feats).all(axis=1)
        projected = tsne.fit_transform(big_pca.fit_transform(relevant_feats[valid_rows]))

        # Rows that could not be projected keep NaN coordinates.
        tsne_out = np.full((relevant_feats.shape[0], 2), np.nan, dtype=projected.dtype)
        tsne_out[valid_rows] = projected

        X_tsne = pd.DataFrame(tsne_out,
                              columns=['x_tsne', 'y_tsne'],
                              index=tangram_inds)
        X_tsne['feats_type'] = feats_type
        viz_pieces.append(pd.concat([group, X_tsne], axis=1))

if viz_pieces:
    embedding_viz = pd.concat(viz_pieces, ignore_index=True, sort=False)
    # Keep the declared columns first (x_mds / y_mds stay empty for now),
    # followed by any additional metadata columns.
    extra_columns = [c for c in embedding_viz.columns if c not in embedding_viz_columns]
    embedding_viz = embedding_viz.reindex(columns=embedding_viz_columns + extra_columns)
else:
    embedding_viz = pd.DataFrame(columns=embedding_viz_columns)


In [None]:
# Persist the 2-D embedding coordinates together with their metadata; with pandas' defaults the row index is written as the first, unnamed column.
embedding_viz.to_csv('outputs/embeddings.csv')