# Imports

In [434]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [429]:
from utils.nlp_utils import get_feats

# pre-process text by lemmatizing

In [405]:
def keep_token(t):
    """Decide whether token `t` should contribute a lemma.

    A token is kept when its `is_alpha` flag is set and neither its
    `is_space` nor its `is_punct` flag is set.
    """
    return t.is_alpha and not t.is_space and not t.is_punct

def lemmatize_doc(doc):
    """Return the lemmas of the tokens in `doc` that pass `keep_token`, in document order."""
    lemmas = []
    for token in doc:
        if keep_token(token):
            lemmas.append(token.lemma_)
    return lemmas

In [176]:
# `Dictionary` is otherwise only imported in the later tf-idf cell; importing it
# here lets this cell run top-to-bottom on a fresh kernel instead of raising
# NameError.
from gensim.corpora import Dictionary

version_to_use = 'tangramsSequential_collapsed'
d_raw = pd.read_csv('../data/{}.csv'.format(version_to_use))
# Parse every utterance with spaCy, then reduce each parsed doc to its lemmas.
d_raw['text'] = [nlp(text) for text in d_raw['contents']]
d_raw['lemmas'] = [lemmatize_doc(parsed_text) for parsed_text in d_raw['text']]
# gensim vocabulary (token <-> integer id) built from all lemmatized utterances.
docs_dict = Dictionary(d_raw['lemmas'])

Some rows are missing, so we need to 'fill in' the content so that those rows will be NaNs.

In [484]:
# Expand the frame to the full gameid x intendedName x repetitionNum grid so that
# combinations absent from the raw data become explicit filler rows.
d = d_raw.copy().set_index(['gameid', 'intendedName', 'repetitionNum'])
mux = pd.MultiIndex.from_product(list(d.index.levels), names=list(d.index.names))
# NOTE(review): fill_value is a one-element list rather than a scalar; it is kept
# as-is because the filler rows are later detected with pd.isna / np.any.
d = d.reindex(mux, fill_value=[np.nan]).reset_index()

# Row positions (after reset_index) whose 'text' entry is a filler.
nan_rows = [pos for (pos, record) in d.iterrows() if pd.isna(record['text'])]
# Same positions expressed as np.insert offsets into a matrix that has the
# filler rows removed: each earlier filler shifts the offset down by one.
nan_insert_rows = [pos - n_before for (n_before, pos) in enumerate(nan_rows)]

gameidList = d['gameid'].unique().tolist()
tangramList = list('ABCDEFGHIJKL')

### create tf-idf weightings

In [486]:
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

# Bag-of-words for every row that has lemmas; rows whose 'lemmas' entry tests
# as missing are skipped.
docs_corpus = [
    docs_dict.doc2bow(lemmas)
    for lemmas in d['lemmas']
    if not np.any(pd.isna(lemmas))
]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
# Dense matrix of tf-idf weights: one row per document, one column per
# dictionary entry.
docs_vecs = np.vstack([
    sparse2full(bow, len(docs_dict))
    for bow in docs_tfidf
])
# One spaCy vector per dictionary entry, stacked in dictionary-id order.
tfidf_emb_vecs = np.vstack([
    nlp(docs_dict[term_id]).vector
    for term_id in range(len(docs_dict))
])

In [490]:
# Each document embedding is the tf-idf-weighted sum of its word vectors.
docs_emb_raw = docs_vecs.dot(tfidf_emb_vecs)
# Put all-NaN rows back at the offsets recorded in nan_insert_rows so the
# matrix lines up row-for-row with `d` again.
docs_emb = np.insert(docs_emb_raw, nan_insert_rows, np.nan, axis=0)

In [493]:
# Sanity check: show the np.insert offsets used to re-insert the NaN rows.
print(nan_insert_rows)

[156, 824, 827, 834, 846, 853, 4208, 4463, 5940, 6072]


In [494]:
# Sanity check: row positions of docs_emb whose first component is NaN.
print(np.flatnonzero(pd.isna(docs_emb[:, 0])).tolist())

[156, 824, 827, 834, 846, 853, 4208, 4463, 5940, 6072]


# Examine semantic embeddings
We'd like to pull out bag-of-words embeddings from the NPs in each utterance in the cued dataset and cluster them for each tangram. We expect to see different pairs in different parts of the space (i.e., to compute a d' for an 'idiosyncrasy' or 'multiple equilibria' result), and also different utterances from a single game closer together.

In [510]:
# This import repeats the one in an earlier cell; it is harmless but redundant.
from utils.nlp_utils import get_feats
# NOTE(review): get_feats is defined in utils/nlp_utils.py, which is not shown here.
# From the way the results are used below, `meta` behaves like a DataFrame and the
# two feature matrices are row-aligned with it -- confirm against the helper.
meta, raw_avg_feats, weighted_feats = get_feats(d, docs_emb, nlp)

In [497]:
# Both feature matrices must have exactly one row per row of `meta`.
assert len(weighted_feats) == len(meta)
assert len(raw_avg_feats) == len(meta)

In [498]:
# Persist the metadata table and both feature matrices (comma-separated text).
meta.to_csv(path_or_buf='outputs/meta_tangrams_embeddings.csv')
np.savetxt(fname='outputs/feats_tangrams_embeddings_tfidf.txt',
           X=weighted_feats,
           delimiter=',')
np.savetxt(fname='outputs/feats_tangrams_embeddings_rawavg.txt',
           X=raw_avg_feats,
           delimiter=',')

TODO: initial distribution w/in vs. across
TODO: variance across pairs at beginning & end
TODO: 2D PCA... (traces of beginnings and ends)
-- Connect individuals in a game with a line!
-- Word clouds for initial and final

# Look at tsne visualization
TODO: there are a bunch of problems with this: many of the creative utterances don't exist in the current embedding (e.g. "ghostman"), sometimes they don't converge to a noun (e.g. "flying"), etc.

In [499]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
# t-SNE and MDS are stochastic, and PCA may use a randomized solver, so pin the
# seed: re-running the notebook then reproduces the same projections.
RANDOM_SEED = 0

tsne = TSNE(n_components=2, random_state=RANDOM_SEED)
# Intermediate reduction applied before t-SNE in the projection loop below.
big_pca = PCA(n_components=50, random_state=RANDOM_SEED)
viz_pca = PCA(n_components=2, random_state=RANDOM_SEED)
mds = MDS(n_components=2, random_state=RANDOM_SEED)

In [509]:
# Scratch checks. Only the last expression of a cell is displayed, so the count of
# rows whose first weighted feature is NaN (next line) is computed and discarded.
len([i for i in range(meta.shape[0]) if pd.isna(weighted_feats[i,0])])
# Part-of-speech tags spaCy assigns to a sample phrase.
[token.pos_ for token in nlp('facing right')]

['VERB', 'ADV']

In [503]:
# Project each tangram's utterance embeddings to 2D (PCA -> t-SNE), once per
# feature type, and stack the results into one long-format table.
#
# Changes from the previous version of this cell:
#   * per-group frames are collected in a list and concatenated once, instead
#     of calling DataFrame.append inside the loop (quadratic, and removed in
#     pandas 2.0);
#   * the notebook-level `nan_rows` / `nan_insert_rows` (used to build
#     `docs_emb`) are no longer overwritten here;
#   * invalid rows are found with one mask over *all* columns, matching which
#     rows are actually withheld from the projection.
viz_columns = ['gameid', 'intendedName', 'repetitionNum',
               'x_tsne', 'y_tsne', 'x_mds', 'y_mds', 'feats_type']

viz_frames = []
for name, group in meta.groupby('intendedName'):
    # Used as positional row indices into the feature matrices.
    # NOTE(review): assumes meta's index labels equal row positions -- confirm.
    tangram_inds = np.array(group.index)
    for feats_type in ['raw_avg', 'weighted']:
        feats = weighted_feats if feats_type == 'weighted' else raw_avg_feats
        relevant_feats = feats[tangram_inds]

        # Rows containing NaN/inf cannot be projected; fit on the finite rows only.
        valid_rows = np.isfinite(relevant_feats).all(axis=1)
        tsne_valid = tsne.fit_transform(
            big_pca.fit_transform(relevant_feats[valid_rows])
        )

        # Scatter the coordinates back so the output has one row per input row,
        # with NaN for rows that were left out. Keeping the projection's dtype
        # avoids changing how the values are written to CSV.
        tsne_out = np.full((relevant_feats.shape[0], tsne_valid.shape[1]),
                           np.nan, dtype=tsne_valid.dtype)
        tsne_out[valid_rows] = tsne_valid

        X_tsne = pd.DataFrame(tsne_out,
                              columns=['x_tsne', 'y_tsne'],
                              index=tangram_inds)
        X_tsne['feats_type'] = feats_type
        viz_frames.append(pd.concat([group, X_tsne], axis=1))

if viz_frames:
    embedding_viz = pd.concat(viz_frames, ignore_index=True, sort=False)
    # Keep the declared columns first (x_mds / y_mds are not computed yet and
    # stay empty), followed by any extra columns carried over from `meta`.
    extra_columns = [c for c in embedding_viz.columns if c not in viz_columns]
    embedding_viz = embedding_viz.reindex(columns=viz_columns + extra_columns)
else:
    # No groups at all: fall back to an empty table with the declared columns.
    embedding_viz = pd.DataFrame(columns=viz_columns)


In [504]:
# Display the combined projection table built in the previous cell.
embedding_viz

Unnamed: 0,gameid,intendedName,repetitionNum,x_tsne,y_tsne,x_mds,y_mds,feats_type,correct,is_nan
0,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,1,-6.817472,-8.608577,,,raw_avg,1,False
1,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,2,-11.403949,-17.788462,,,raw_avg,1,False
2,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,3,-11.836440,-17.912106,,,raw_avg,1,False
3,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,4,-11.159099,-18.159309,,,raw_avg,1,False
4,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,5,-11.856598,-18.363270,,,raw_avg,1,False
5,0057-414228f8-c268-40d6-9349-b35df4f080d9,A,6,-11.437317,-18.521347,,,raw_avg,1,False
6,0349-951c1418-40e9-48b3-8290-7ed4461f4d54,A,1,9.567870,-0.788492,,,raw_avg,0,False
7,0349-951c1418-40e9-48b3-8290-7ed4461f4d54,A,2,18.021465,2.599716,,,raw_avg,0,False
8,0349-951c1418-40e9-48b3-8290-7ed4461f4d54,A,3,21.434008,3.472485,,,raw_avg,1,False
9,0349-951c1418-40e9-48b3-8290-7ed4461f4d54,A,4,19.030233,2.384701,,,raw_avg,1,False


In [505]:
# Write the projection table (including its row index) to disk.
embedding_viz.to_csv(path_or_buf='outputs/embeddings.csv')

In [None]:
# TODO: compare cosine-similarity distances within vs. across games?
#
# TODO: compare the variance across games at the beginning and at the end