<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Merging-attributes-from-BioSample-using-word2vec" data-toc-modified-id="Merging-attributes-from-BioSample-using-word2vec-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Merging attributes from BioSample using word2vec</a></span><ul class="toc-item"><li><span><a href="#Set-up" data-toc-modified-id="Set-up-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Set-up</a></span><ul class="toc-item"><li><span><a href="#Import-needed-packages" data-toc-modified-id="Import-needed-packages-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Import needed packages</a></span></li><li><span><a href="#Helper-function-definitions" data-toc-modified-id="Helper-function-definitions-1.1.2"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Helper function definitions</a></span></li><li><span><a href="#Load-data-and-embedding-model" data-toc-modified-id="Load-data-and-embedding-model-1.1.3"><span class="toc-item-num">1.1.3&nbsp;&nbsp;</span>Load data and embedding model</a></span></li><li><span><a href="#Build-dataframe" data-toc-modified-id="Build-dataframe-1.1.4"><span class="toc-item-num">1.1.4&nbsp;&nbsp;</span>Build dataframe</a></span></li><li><span><a href="#Define-attribute-groupings" data-toc-modified-id="Define-attribute-groupings-1.1.5"><span class="toc-item-num">1.1.5&nbsp;&nbsp;</span>Define attribute groupings</a></span></li></ul></li><li><span><a href="#Calculate-average-embedding-vectors" data-toc-modified-id="Calculate-average-embedding-vectors-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Calculate average embedding vectors</a></span><ul class="toc-item"><li><span><a href="#Randomly-sample-100-of-each-attribute-that-occurs-greater-than-100-times" data-toc-modified-id="Randomly-sample-100-of-each-attribute-that-occurs-greater-than-100-times-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Randomly sample 100 of each attribute that occurs greater than 100 
times</a></span></li><li><span><a href="#Get-average-embedding-vector-for-each-attribute" data-toc-modified-id="Get-average-embedding-vector-for-each-attribute-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Get average embedding vector for each attribute</a></span></li></ul></li><li><span><a href="#Merge-entities-with-high-cosine-similarity" data-toc-modified-id="Merge-entities-with-high-cosine-similarity-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Merge entities with high cosine similarity</a></span></li><li><span><a href="#Determine-coverage-increase-with-merging" data-toc-modified-id="Determine-coverage-increase-with-merging-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Determine coverage increase with merging</a></span><ul class="toc-item"><li><span><a href="#Build-dataframe" data-toc-modified-id="Build-dataframe-1.4.1"><span class="toc-item-num">1.4.1&nbsp;&nbsp;</span>Build dataframe</a></span></li></ul></li><li><span><a href="#Analyze-word-embedding-space" data-toc-modified-id="Analyze-word-embedding-space-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Analyze word embedding space</a></span><ul class="toc-item"><li><span><a href="#Figure3A:-PCA-of-class-based-terms" data-toc-modified-id="Figure3A:-PCA-of-class-based-terms-1.5.1"><span class="toc-item-num">1.5.1&nbsp;&nbsp;</span><font color="red">Figure3A:</font> PCA of class based terms</a></span></li><li><span><a href="#Figure3B:-Cosine-similarity-based-hierarchical-clustering-of-class-based-terms" data-toc-modified-id="Figure3B:-Cosine-similarity-based-hierarchical-clustering-of-class-based-terms-1.5.2"><span class="toc-item-num">1.5.2&nbsp;&nbsp;</span><font color="red">Figure3B:</font> Cosine similarity based hierarchical clustering of class based terms</a></span></li><li><span><a href="#Figure3C:-PCA-of-data-type-based-terms" data-toc-modified-id="Figure3C:-PCA-of-data-type-based-terms-1.5.3"><span class="toc-item-num">1.5.3&nbsp;&nbsp;</span><font 
color="red">Figure3C:</font> PCA of data type based terms</a></span></li><li><span><a href="#Figure3D:-Cosine-similarity-based-hierarchical-clustering-of-data-type-based-terms" data-toc-modified-id="Figure3D:-Cosine-similarity-based-hierarchical-clustering-of-data-type-based-terms-1.5.4"><span class="toc-item-num">1.5.4&nbsp;&nbsp;</span><font color="red">Figure3D:</font> Cosine similarity based hierarchical clustering of data type based terms</a></span></li><li><span><a href="#Figure1F:-Cosine-similarity-based-hierarchical-clustering-for-methodology-figure" data-toc-modified-id="Figure1F:-Cosine-similarity-based-hierarchical-clustering-for-methodology-figure-1.5.5"><span class="toc-item-num">1.5.5&nbsp;&nbsp;</span><font color="red">Figure1F:</font> Cosine similarity based hierarchical clustering for methodology figure</a></span></li></ul></li></ul></li></ul></div>

# Merging attributes from BioSample using word2vec
Adam Klie<br>
11/17/2019<br>
Script that defines the output classes for the neural net by merging attributes whose word embeddings are similar

## Set-up

### Import needed packages

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.spatial as sp
import spacy

In [None]:
from matplotlib import rcParams
import matplotlib as mpl

# Global matplotlib defaults shared by every figure in this notebook.
rcParams.update({
    # Figure and line appearance
    'figure.figsize': (10, 6),
    'figure.dpi': 600,
    'lines.linewidth': 2,
    'axes.facecolor': 'white',
    'font.size': 18,
    'patch.edgecolor': 'white',
    'font.family': 'StixGeneral',
    # Axis label and tick label sizes
    'axes.labelsize': 30,
    'ytick.labelsize': 24,
    'xtick.labelsize': 24,
})

### Helper function definitions

In [None]:
def correlate_dataframes(df1, df2, metric = 'cosine' ):
    """Return the pairwise similarity (1 - distance) between the rows of two frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        One observation per row; both are handed to ``scipy.spatial.distance.cdist``.
    metric : str
        Distance metric name understood by ``cdist`` (default ``'cosine'``,
        for which ``1 - distance`` is the cosine similarity).

    Returns
    -------
    pandas.DataFrame
        Rows are labelled with ``df1.index`` and columns with ``df2.index``.
    """
    distances = sp.distance.cdist(df1, df2, metric)
    labelled = pd.DataFrame(distances, index=df1.index, columns=df2.index)
    return 1 - labelled

### Load data and embedding model

In [None]:
# Load embedding model
# (spaCy pipeline stored in a local directory; judging by the directory name these are
#  word2vec vectors trained on Wikipedia, PubMed and PMC — TODO confirm)
nlp = spacy.load('../../data/wikipedia-pubmed-and-PMC-w2v')

In [None]:
# Load attribute-value pairs
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
qiita_dir = "../../data/qiita/allQiita.pickle"
# Entries with missing values are dropped immediately after loading.
allQiita = pd.read_pickle(qiita_dir).dropna()

### Build dataframe

In [None]:
# Keep every attribute of the samples whose 'host_common_name' entry equals 'human'.
# Index level 0 identifies the sample, level 1 the attribute name.
sample_level = allQiita.index.get_level_values(0)
host_name_m = allQiita.index.get_level_values(1) == 'host_common_name'
hs_m = allQiita.values == 'human'
human_host_rows = allQiita[host_name_m & hs_m]
hs = human_host_rows.index.get_level_values(0)
hs_qiita = allQiita[sample_level.isin(hs)]

In [None]:
# Flatten the human-only data into a long table: one row per (sample, attribute) pair
qiita_df = pd.DataFrame(hs_qiita).reset_index()
qiita_df.columns = ['sample_id', 'attribute', 'value']
# The full table is no longer needed in this part of the notebook
del allQiita

### Define attribute groupings

In [None]:
# Seed attribute name -> display name of the merged group that the seed anchors.
grouping = pd.Series(dict(
    sex='Sex',
    age='Age',
    bmi='BMI',
    body_site='Body site',
    antibiotics='Antibiotic usage',
    pregnant='Pregnancy status',
    diet='Diet type',
    ethnicity='Ethnicity',
))

In [None]:
# Name substituted for the {model} placeholder in the results paths written below
model_iter = "qiita"

## Calculate average embedding vectors

### Randomly sample 100 of each attribute that occurs greater than 100 times

In [None]:
# Keep only the attributes that occur in more than 100 rows.
# Values are cast to str first so every entry is text.
qiita_df['value'] = qiita_df['value'].astype(str)
attribute_counts = qiita_df['attribute'].value_counts()
recurr_attrib = attribute_counts[attribute_counts > 100].index
is_recurring = qiita_df['attribute'].isin(recurr_attrib)
recurring_df = qiita_df[is_recurring]
del qiita_df

In [None]:
# Report how many attributes passed the >100 occurrences filter
print("There are %d recurring attributes (greater than 100 samples)" % len(recurr_attrib))

In [None]:
# Shuffle all rows, then keep the first 100 rows of each attribute: a random
# sample of at most 100 values per attribute, used to build its mean vector.
# NOTE(review): no random_state is set, so the sample differs between runs.
recurring_df = recurring_df.sample(n=len(recurring_df))
subset_df = recurring_df.groupby('attribute').head(100)
del recurring_df

### Get average embedding vector for each attribute

In [None]:
# Get vector representation for all attribute-value pairs randomly selected
# Convert the values to text once and use that same text for both the embedding
# input and the index, without writing into subset_df (it is a slice of a larger
# frame, so assigning a column to it triggers pandas' SettingWithCopyWarning).
values_as_text = subset_df['value'].astype(str)
docs = nlp.pipe(values_as_text.tolist())
subset_index = (subset_df.assign(value=values_as_text)
                         .set_index(['sample_id','attribute','value'])
                         .index)
del subset_df, values_as_text
# One vector per value, taken from each processed document
vectors = [doc.vector for doc in docs]
del docs
# Rows are labelled by (sample_id, attribute, value); columns are embedding dimensions
doc_vector_df = pd.DataFrame(vectors, index=subset_index).astype(float)

In [None]:
# Take average of each attribute and correlate them by cosine similarity
attribute_means = doc_vector_df.groupby('attribute').mean()
# Drop attributes whose mean vector is all zeros (cosine similarity is undefined
# for them). Testing the sum of absolute values keeps a non-zero vector whose
# components happen to cancel, which a plain sum() would wrongly discard.
has_nonzero_vector = attribute_means.abs().sum(axis=1) > 0
embedding_df = attribute_means[has_nonzero_vector]
# Square attribute-by-attribute similarity matrix
corr_df = correlate_dataframes(embedding_df, embedding_df)

In [None]:
# Histogram of all pairwise attribute similarities
# (every cell of the square matrix, so the diagonal and both halves are included).
all_similarities = corr_df.unstack().dropna()
plt.hist(all_similarities, bins = 20)
plt.title('Distribution of cosine similarities of attributes')
plt.xlabel("Cosine similarity")
plt.ylabel('Frequency')

## Merge entities with high cosine similarity

In [None]:
# Choose higher categories that will encompass merged attributes
# Minimum cosine similarity to a seed attribute required for merging into its group
similarity_threshold = 0.8
# The seed attributes are the keys of `grouping`
selected_seeds = grouping.index

In [None]:
# For every seed attribute, collect the attributes that get merged into it.
dataframe_list = []
for attribute in selected_seeds:
    seed_similarities = corr_df[attribute]
    if attribute == 'age':
        # 'age' is special-cased: only its spelling variants are taken,
        # regardless of the similarity threshold.
        is_age_variant = seed_similarities.index.isin(['age', 'AGE', 'Age'])
        kept = seed_similarities[is_age_variant].sort_values(ascending = False)
    else:
        # All other seeds: every attribute at or above the threshold, most similar first.
        ranked = seed_similarities.sort_values(ascending = False)
        kept = ranked[ranked >= similarity_threshold]
    tmp_df = kept.to_frame().reset_index()
    tmp_df.columns = ['attribute','similarity']
    dataframe_list.append(tmp_df)

In [None]:
# Create a dataframe of entities that are similar
# Stack the per-seed tables; the outer index level records which seed each row belongs to.
merged_df = pd.concat(dataframe_list, keys = selected_seeds, axis = 0)
merged_df.index.names=['GroupName','I']
merged_reset_df = merged_df.reset_index()
# NOTE(review): this threshold is applied to every group, so 'age' spelling variants
# below the threshold are dropped here even though the seed loop kept them — confirm intended.
# .copy() makes similar_df an independent frame, so the column assignments made on it
# here and later do not trigger pandas' SettingWithCopyWarning.
similar_df = merged_reset_df[merged_reset_df['similarity'] >= similarity_threshold].copy()
# Replace the seed attribute name with the display name of its group
similar_df['GroupName'] = similar_df['GroupName'].map(grouping)

In [None]:
# Inspect the attributes merged into the antibiotics group.
# GroupName already holds display names at this point, so look the label up
# through `grouping` rather than comparing against the seed key.
similar_df[similar_df['GroupName'] == grouping['antibiotics']]

In [None]:
# Save the merged-attribute table; {model} in the path is filled with model_iter
similar_df.to_csv('../results/embedding/{model}/entity_merging.csv'.format(model = model_iter))

## Determine coverage increase with merging

### Build dataframe

In [None]:
# Load attribute-value pairs
# (reloaded because allQiita was deleted earlier in the notebook)
qiita_dir = "../../data/qiita/allQiita.pickle"
allQiita = pd.read_pickle(qiita_dir).dropna()

In [None]:
# Long-format table built from the full data (no human-host filter is applied here)
qiita_df = pd.DataFrame(allQiita).reset_index()
qiita_df.columns = ['sample_id', 'attribute', 'value']
del allQiita

In [None]:
# Number of rows in the full table for each attribute listed in similar_df
attribute_totals = qiita_df['attribute'].value_counts()
similar_df['value count'] = attribute_totals.loc[similar_df['attribute'].values].values

In [None]:
# Coverage table with one row per merged group: the seed attribute, its own count,
# the total count of all attributes merged into the group, and the ratio of the two.
# A row is a seed row when its attribute is the seed that defines its own group
# (non-seed attributes map to NaN and compare False).
is_seed_row = similar_df['attribute'].map(grouping) == similar_df['GroupName']
counts_df = similar_df[is_seed_row].sort_values('GroupName').drop('I', axis = 1).copy()
# Sum the counts per group and attach them by group name, so the values stay aligned
# whatever the number or order of rows in counts_df.
merged_counts = similar_df.groupby('GroupName')['value count'].sum()
counts_df['merged count'] = counts_df['GroupName'].map(merged_counts)
counts_df['factor of increase'] = counts_df['merged count']/counts_df['value count']

In [None]:
# Show the coverage table (display is provided by the IPython/Jupyter session)
display(counts_df)

In [None]:
# Write the coverage table twice: to the per-model results folder and as
# Supplementary Table 1 (that path has no placeholder, so it needs no formatting).
coverage_csv = '../results/embedding/{model}/entity_coverage.csv'.format(model=model_iter)
counts_df.to_csv(coverage_csv)
counts_df.to_csv('../doc/figures/Supplementary/Supp_Table1.csv')

## Analyze word embedding space

In [None]:
from sklearn import manifold, decomposition
import seaborn as sns

In [None]:
# Keys of the model's vector table
keys = nlp.vocab.vectors.keys()

In [None]:
# One row per entry of the vector table, labelled with the keys
# NOTE(review): assumes keys() yields one key per row of vectors.data, in row order — confirm
word_vec_df = pd.DataFrame(nlp.vocab.vectors.data, index = keys)

In [None]:
# Preview the first rows of the vector table
word_vec_df.head()

In [None]:
# Map each vector key to its string form, skipping keys the string store does not contain.
keys = nlp.vocab.vectors.keys()
keyToString = {
    key: nlp.vocab.strings[key]
    for key in keys
    if key in nlp.vocab.strings
}

In [None]:
# Same mapping as a Series (index: key, value: string) so it can be indexed by many keys at once
keyToStringS = pd.Series(keyToString)

In [None]:
# Strings in the order of the vector keys
# NOTE(review): keys that were skipped while building keyToString are missing here;
# depending on the pandas version this raises KeyError or yields NaN — confirm
strings = keyToStringS[nlp.vocab.vectors.keys()].values

In [None]:
# Label the vector rows with words instead of keys so terms can be looked up with .loc
word_vec_df.index = strings

### <font color=red>Figure3A:</font> PCA of class based terms

In [None]:
# Vectors of the terms shown in Figure 3A/3B, listed three per line.
class_terms = [
    'male', 'female', 'hermaphrodite',
    'atherosclerosis', 'cancer', "Alzheimer's",
    'Microarray', 'RNA-seq', 'ChIP-seq',
    'weeks', 'years', 'hours',
]
classes = word_vec_df.loc[class_terms]

In [None]:
# Project the class-term vectors onto their first two principal components
dimensionReduction = decomposition.PCA(n_components = 2)
M = dimensionReduction.fit_transform(classes)

In [None]:
# PCA coordinates (columns 0 and 1) indexed by term
tmpDf = pd.DataFrame(data=M,index=classes.index)

In [None]:
# Pairwise correlation between the term vectors
# (DataFrame.corr compares columns, hence the transpose)
classes.T.corr()

In [None]:
# Share of variance explained by each of the two components, as percentages for the axis labels
var_exp = dimensionReduction.explained_variance_ratio_
PC_1, PC_2 = (var_exp[i]*100 for i in (0, 1))

In [None]:
# Figure 3A: scatter plot of the class terms in PCA space, each point labelled with its term.
fig,ax=plt.subplots(figsize=(6,6))
tmpDf.plot(x=0,y=1,kind='scatter',ax=ax,s=14)
for name,row in tmpDf.iterrows():
    # The label is passed positionally: the keyword was renamed from `s` to `text`
    # in Matplotlib 3.3, and a positional argument works with either name.
    ax.annotate(name, xy=(row[0],row[1]), fontsize=16)
ax.set_xlim(right=3.0)
ax.set_xlabel('PC1 (%.2f%%)' % PC_1)
ax.set_ylabel('PC2 (%.2f%%)' % PC_2)
# Saved both as vector (EPS) and raster (PNG)
plt.savefig('../doc/figures/Figure3/Figure3A.eps', dpi=600, bbox_inches="tight")
plt.savefig('../doc/figures/Figure3/Figure3A.png', dpi=600, bbox_inches="tight")
plt.close()

### <font color=red>Figure3B:</font> Cosine similarity based hierarchical clustering of class based terms

In [None]:
# Figure 3B: hierarchical clustering of the term-by-term correlation matrix,
# with rows/columns compared using the cosine metric.
# NOTE(review): seaborn documents z_score as 0 (rows) or 1 (columns); True presumably acts as 1 — confirm
sns.clustermap(data=classes.T.corr(), metric='cosine', z_score=True)
plt.savefig('../doc/figures/Figure3/Figure3B.eps', dpi=600, bbox_inches="tight")
plt.savefig('../doc/figures/Figure3/Figure3B.png', dpi=600, bbox_inches="tight")
plt.close();

### <font color=red>Figure3C:</font> PCA of data type based terms

In [None]:
# Vectors of the data-type terms shown in Figure 3C/3D.
data_type_terms = [
    'DNase-seq', 'ChIP-seq', 'immunoprecipitation-sequencing',
    'mRNA-seq', 'RNA-sequencing', "RNA-seq",
    'bisulfite-sequencing', 'Bis-seq', 'Methyl-seq', 'bisulfite-seq',
]
types = word_vec_df.loc[data_type_terms]

In [None]:
# Project the data-type term vectors onto their first two principal components
dimensionReduction = decomposition.PCA(n_components = 2)
M = dimensionReduction.fit_transform(types)

In [None]:
# PCA coordinates (columns 0 and 1) indexed by term
tmpDf = pd.DataFrame(data=M,index=types.index)

In [None]:
# Share of variance explained by each of the two components, as percentages for the axis labels
var_exp = dimensionReduction.explained_variance_ratio_
PC_1, PC_2 = (var_exp[i]*100 for i in (0, 1))

In [None]:
# Figure 3C: scatter plot of the data-type terms in PCA space, each point labelled with its term.
fig,ax=plt.subplots(figsize=(6,6))
tmpDf.plot(x=0,y=1,kind='scatter',ax=ax,s=14)
for name,row in tmpDf.iterrows():
    # The label is passed positionally: the keyword was renamed from `s` to `text`
    # in Matplotlib 3.3, and a positional argument works with either name.
    ax.annotate(name, xy=(row[0],row[1]), fontsize=16)
ax.set_xlim(right=3.0)
ax.set_xlabel('PC1 (%.2f%%)' % PC_1)
ax.set_ylabel('PC2 (%.2f%%)' % PC_2)
# Saved both as vector (EPS) and raster (PNG)
plt.savefig('../doc/figures/Figure3/Figure3C.eps', dpi=600, bbox_inches="tight")
plt.savefig('../doc/figures/Figure3/Figure3C.png', dpi=600, bbox_inches="tight")
plt.close()

### <font color=red>Figure3D:</font> Cosine similarity based hierarchical clustering of data type based terms

In [None]:
# Figure 3D: hierarchical clustering of the term-by-term correlation matrix,
# with rows/columns compared using the cosine metric.
# NOTE(review): seaborn documents z_score as 0 (rows) or 1 (columns); True presumably acts as 1 — confirm
sns.clustermap(data=types.T.corr(), metric='cosine', z_score=True)
plt.savefig('../doc/figures/Figure3/Figure3D.eps', dpi=600, bbox_inches="tight")
plt.savefig('../doc/figures/Figure3/Figure3D.png', dpi=600, bbox_inches="tight")
plt.close();

### <font color=red>Figure1F:</font> Cosine similarity based hierarchical clustering for methodology figure

In [None]:
# Vectors of the example terms used in the methodology figure (Figure 1F).
methodology_terms = [
    'RNA-sequencing', 'RNA-seq',
    'keratinocytes', 'epidermis',
    'psoriasis', 'inflammation',
]
terms = word_vec_df.loc[methodology_terms]

In [None]:
# Project the methodology-figure term vectors onto their first two principal components
dimensionReduction = decomposition.PCA(n_components = 2)
M = dimensionReduction.fit_transform(terms)

In [None]:
# PCA coordinates (columns 0 and 1) indexed by term
tmpDf = pd.DataFrame(data=M,index=terms.index)

In [None]:
# Share of variance explained by each of the two components, as percentages
var_exp = dimensionReduction.explained_variance_ratio_
PC_1, PC_2 = (var_exp[i]*100 for i in (0, 1))

In [None]:
# Figure 1F: scatter plot of the methodology-figure terms in PCA space.
fig,ax=plt.subplots(figsize=(6,6))
tmpDf.plot(x=0,y=1,kind='scatter',ax=ax,s=14)
for name,row in tmpDf.iterrows():
    # NOTE(review): the 'inflammation' point is drawn with the label
    # 'psoriasis vulgaris' — confirm this relabelling is intended for the figure.
    label = 'psoriasis vulgaris' if name == 'inflammation' else name
    # The label is passed positionally: the keyword was renamed from `s` to `text`
    # in Matplotlib 3.3, and a positional argument works with either name.
    ax.annotate(label, xy=(row[0],row[1]), fontsize=20)
ax.set_xlim(right=3.0)
# These labels contain no format placeholder, so applying `% PC_1` / `% PC_2`
# to them raised TypeError; the plain strings are used as written.
ax.set_xlabel('First principal component', fontsize=24)
ax.set_ylabel('Second principal component', fontsize=24)
# Both output formats go to the Figure1 folder (the PNG previously went to Figure3).
plt.savefig('../doc/figures/Figure1/Figure1F.eps', dpi=600, bbox_inches="tight")
plt.savefig('../doc/figures/Figure1/Figure1F.png', dpi=600, bbox_inches="tight")
plt.close()