# Building word vectors

# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import gc
import random
import smart_open
import h5py
import csv
import tensorflow as tf
import gensim

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

random_state_number = 967898

In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext autoreload
%autoreload

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()

# Data

## load corpus vocab and wordidx

In [13]:
corpus_vocab_list, corpus_vocab_wordidx = None, None
with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (corpus_vocab_list, corpus_wordidx) = pickle.load(f)
print(len(corpus_vocab_list), len(corpus_wordidx))

352220 352220


## load data

In [7]:
store = pd.HDFStore('processed/stage1/data_frames.h5')
train_df = store['train_df']
test_df = store['test_df']

# Word Vectors Pre Trained

## collecting biolab words

In [9]:
from gensim.models.keyedvectors import KeyedVectors
biolab_keyed_vectors_pubmed_pmc_wiki = KeyedVectors.load_word2vec_format('external/biolab_wvs/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

In [10]:
biolab_words_pubmed_pmc_wiki = biolab_keyed_vectors_pubmed_pmc_wiki.vocab.keys()
biolab_words = set(biolab_words_pubmed_pmc_wiki)
len(biolab_words)

5443656

In [12]:
vocab_biolab = set(biolab_words) & set(vocab_words)
print (len(vocab_biolab))
vocab_biolab

100489


{'inhibitor1',
 'expression19',
 'espinosa',
 'plate-shaped',
 '5359',
 'adrenal',
 'auroras',
 'de-emphasized',
 'bootstrap',
 'glyceraldehyde-3',
 '3.359',
 '16/121',
 'attenuated',
 '24.29',
 'dounced',
 'uroplakins',
 'splice-donor',
 'gplots',
 '8/117',
 'num',
 'featuresof',
 'snail',
 'favored',
 'greater-than-additive',
 'forty-seven',
 'augment',
 'grouch',
 'de-activating',
 'multiplies',
 'renaming',
 'bx',
 'vy',
 'eosinophlia',
 'tumors.2',
 'excretory',
 'ofapoptotic',
 'typi',
 'under-estimated',
 'maya',
 'extraosseous',
 'rho/rac',
 'ci',
 'galt',
 'anti-b-actin',
 '1-year',
 '0562',
 'hypodiploid',
 'regions.5',
 'isoform-selective',
 'entero-pancreatic',
 'cpt-11',
 's-protein',
 'ns/s',
 'caldesmon',
 'p50/p50',
 'translatable',
 'cytoprotection',
 'acneform',
 'defeating',
 '10p11.23',
 'pharmacotherapy',
 'haemorrhagic',
 '0.262',
 '0.20-0.78',
 'r=0.352',
 'mice19',
 'instituted',
 '4.5.4',
 '0.04',
 'refractions',
 'whenas',
 '20.47',
 'cells.15',
 'villin-like'

In [14]:
vocab_not_in_biolab =set(vocab_words) - set(biolab_words)
print(len(vocab_not_in_biolab))
vocab_not_in_biolab

251731


{'',
 'mscv-nup214-abl1-ires-gfp',
 'limma.18',
 'c3h/1oth/2',
 '0.41.0',
 'stablee',
 'gdc-0879mediated',
 'a62t',
 'tumorsnamely',
 'her2-so/cep17-sg',
 'buffersubcloned',
 'measurementss.d',
 'cbreast',
 'rppametastasis-associated',
 'phenylalanine18',
 'detectableboard',
 'c.17981799gt',
 'ofdoes',
 'olfm4',
 'fkbp12rapamycin',
 'phenotype.lambdoid',
 'fragment17',
 'slc34a2',
 '20gap',
 'suppressorwell',
 'saciisite',
 'reportednrnac.1a',
 'd177y',
 'observeddownloadin',
 'p454s',
 'only.7.four',
 'ganglioglioma.25',
 'glissons',
 'puastderkwt',
 'lamp2-positive',
 'a/gagarose',
 'dahln',
 'obstructiona',
 'etv1suggest',
 'e32g',
 'catga',
 'identity22',
 'mm00455685_m1',
 'cys72asp74',
 'itdalleles',
 'coimmunoprecipitatedwith',
 'andpepstatin',
 'micej',
 '4016m1r219',
 'treatment-relatedareas',
 'smap.23',
 'r70w',
 'pt3n1',
 'genessupplementaryincluding',
 '0.064-2.262',
 '2a763v',
 'observations29',
 'p14arf/dapk/p53',
 'a314vexhibited',
 '321a',
 'severalfragment',
 'cellrea

dont need word to id dict since this is indexed with words

## using biolab words for missing corpus words

In [17]:
undesirable_ascii_characters = list(range(32))
undesirable_ascii_characters.remove(10) #keep new line since this might be used for sentence tokenizer
undesirable_charmap = dict.fromkeys(undesirable_ascii_characters)

In [20]:
from nltk import word_tokenize
from utils import custom_word_tokenizer, apply_custom_regx

custom_tokenized_biolab_pubmed_pmc_wiki_wv = {}
for word in vocab_biolab:
    vector = biolab_keyed_vectors_pubmed_pmc_wiki.word_vec(word)
    custom_tokenized_biolab_pubmed_pmc_wiki_wv[word.lower()] = vector
    word = word.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore')
    word = str(word).translate(undesirable_charmap)
    word = apply_custom_regx(word)
    word = word.replace('\\t', '')
    for part in word_tokenize(word):
        if part in custom_tokenized_biolab_pubmed_pmc_wiki_wv:
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] += vector
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] /= 2

In [21]:
len(custom_tokenized_biolab_pubmed_pmc_wiki_wv)

100489

### for tensorboard

In [27]:
tb_vocab_size=5000

In [38]:
tb_vocab_biolab = list(vocab_biolab)[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(tb_vocab_biolab)

tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i,word in enumerate(tb_vocab_biolab):
    tb_word_vectors[i] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]

In [40]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb")

(5000, 200)


In [35]:
del tb_word_vectors

## building word vectors of 200d for model
    

In [22]:
corpus_word_vectors = np.random.randn(len(vocab_words), 200)
corpus_word_vectors.shape

(352220, 200)

fill in biolab vectors available

In [23]:
for word in vocab_biolab:
    dataset_corpus_word_index = vocab_wordidx[word]
    corpus_word_vectors[dataset_corpus_word_index] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]

total words not updated with training from biolab

In [24]:
words_not_updated = set(vocab_words) - vocab_biolab
len(words_not_updated)

251731

In [25]:
words_not_updated

{'',
 'mscv-nup214-abl1-ires-gfp',
 'limma.18',
 'c3h/1oth/2',
 '0.41.0',
 'stablee',
 'gdc-0879mediated',
 'a62t',
 'tumorsnamely',
 'her2-so/cep17-sg',
 'buffersubcloned',
 'measurementss.d',
 'cbreast',
 'rppametastasis-associated',
 'phenylalanine18',
 'detectableboard',
 'c.17981799gt',
 'ofdoes',
 'olfm4',
 'fkbp12rapamycin',
 'phenotype.lambdoid',
 'fragment17',
 'slc34a2',
 '20gap',
 'suppressorwell',
 'saciisite',
 'reportednrnac.1a',
 'd177y',
 'observeddownloadin',
 'p454s',
 'only.7.four',
 'ganglioglioma.25',
 'glissons',
 'puastderkwt',
 'lamp2-positive',
 'a/gagarose',
 'dahln',
 'obstructiona',
 'etv1suggest',
 'e32g',
 'catga',
 'identity22',
 'mm00455685_m1',
 'cys72asp74',
 'itdalleles',
 'coimmunoprecipitatedwith',
 'andpepstatin',
 'micej',
 '4016m1r219',
 'treatment-relatedareas',
 'smap.23',
 'r70w',
 'pt3n1',
 'genessupplementaryincluding',
 '0.064-2.262',
 '2a763v',
 'observations29',
 'p14arf/dapk/p53',
 'a314vexhibited',
 '321a',
 'severalfragment',
 'cellrea

In [26]:
np.save("processed/stage1/biolab_updated_wvs.npy", corpus_word_vectors)

## gcloud tensorboard serving

In [14]:
dataset_corpus_words_list = np.load("dataset_corpus_words_list.npy")
corpus_word_vectors = np.load("corpus_word_vectors.npy")

In [15]:
tb_vocab_size = 10000

In [None]:
local_tb_dir = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/gcloud/"

In [34]:
with open(local_tb_dir+"/vocab.tsv", "wb") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(dataset_corpus_words_list[:tb_vocab_size])

for http://projector.tensorflow.org/ vectors need to be in tsv form

In [13]:
# np.savetxt("model_wv_visualize/word_vectors.tsv",corpus_word_vectors[:tb_vocab_size], delimiter='\t')

write to checkpoint file

In [30]:
!rm $local_tb_dir/checkpoint
!ls $local_tb_dir

rm: cannot remove '/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/checkpoint': No such file or directory


In [32]:
from word2vec import visualize_embeddings_in_tensorboard
visualize_this_embedding = corpus_word_vectors[:tb_vocab_size]
print visualize_this_embedding.shape
# path for gcloud tensorboard
metadata_path = "/home/bicepjai/projects/tb_visual/vocab.tsv"
# metadata_path = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, local_tb_dir)

(10000, 200)


In [33]:
checkpoint_txt = "model_checkpoint_path: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\"\n\
all_model_checkpoint_paths: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\""
with open(local_tb_dir+"/checkpoint","w") as f:
    f.seek(0)
    f.truncate()
    f.write(checkpoint_txt)

# FastText Vectors

### fasttext commands used

fasttext skipgram -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e

fasttext cbow -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e

### reading ft vectors

In [32]:
fasttext_vec_file = "processed/stage2/pretrained_word_vectors/ft_sg_200d_10e.vec"

In [33]:
ft_lines = None
with open(fasttext_vec_file,"r") as f:
    ft_lines = f.readlines()

In [34]:
print(ft_lines[0])
print(type(ft_lines), len(ft_lines))
ft_shape = tuple([int(i.strip()) for i in ft_lines[0].split()])
ft_shape

362933 200

<class 'list'> 362934


(362933, 200)

In [35]:
print(len(ft_lines[1].split()))
ft_lines[1]

201


'the 0.027251 -0.018114 0.0096083 0.076723 -0.29626 0.05729 0.17298 0.097187 0.10251 0.16822 -0.40156 0.12471 0.11843 0.069956 0.031858 -0.20362 0.18791 -0.20113 -0.20219 0.002323 -0.30366 0.16106 -0.091842 0.028771 -0.082447 0.18842 0.02471 -0.10553 -0.28138 0.044856 -0.041988 -0.031351 0.25131 -0.18547 0.23941 -0.18438 0.12292 -0.039016 0.075311 0.028379 0.024822 -0.069827 0.054794 0.19297 0.19053 -0.15749 0.21978 -0.003489 -0.15063 -0.018887 0.05638 0.1385 0.10112 0.023256 -0.22436 -0.27619 -0.047866 -0.053595 0.010177 0.059109 0.078079 0.080721 -0.017329 0.29334 0.19386 0.1279 0.04759 0.11951 -0.37341 -0.028312 0.0086509 0.021498 0.049069 0.094658 -0.076768 0.00541 -0.0013258 -0.062564 -0.092488 0.15718 0.21148 0.11005 0.088614 0.17268 0.057106 -0.0044174 -0.0072504 0.01389 -0.067416 -0.18715 -0.009639 0.12991 0.11389 -0.0017624 0.020464 -0.19809 -0.038933 -0.016631 -0.24906 0.012139 0.21376 0.14972 -0.16496 0.3738 -0.095022 0.10864 -0.058577 -0.034298 0.0021112 -0.010114 -0.024814

In [36]:
ft_vocab_size=ft_shape[0]
ft_vocab_size

362933

In [37]:
ft_word_vectors = np.random.randn(ft_vocab_size, ft_shape[1])
ft_words = []

In [38]:
for i, line in enumerate(ft_lines[1:]):
    str_list =line.split()
    ft_words.append(str_list[0].strip())
    vec = np.array([np.float(f) for f in str_list[1:]])
    ft_word_vectors[i] = vec

In [39]:
ft_word_vectors.shape

(362933, 200)

In [40]:
a = list(ft_words)
a.sort(key=len, reverse=True)
print(a[:10])
del a

['k2950n1.6.2501.857.151010r2108h2.722.55.521.745.481010s1733f8.88.7608.121.33108g1529r4.123.9708.091.23108i2285v5.492.4107.97.86107l1019v3.81.543.186.462.86106a75p3.52.312.516.342.20106t3349a.711.233.45.352.22105r1190w1.991.172.125.271.88105p1819s4.47.771.535.231.70105t630i4.64.5805.231.68105g1771d4.97.1505.121.33105k1690n4.49.5305.021.05105s1172l3.36.56.894.816.48104q2384k1.73.35.264.796.11104c554w1.31.33.74.715.15104d2312v.062.731.914.573.76104ivs26-20ct1.33.71.534.483.04104g602r.89.173.594.32.02104e462g1.39.641.823.846,960n56t.97.112.963.826,666h2074n3.5.3103.816,513r2973c1.39.262.633.755,685c3198r1.651.22.873.745,541i1929v.371.931.33.593,914n1228d.351.182.043.573,726h1918y2.31.2503.553,552v1306i1.891.21.373.472,979y3098h3.45.71.73.462,892v2969m2.06.82.573.452,806v894i.91.011.483.392,440i1349t.642.7203.372,320q1396r.021.741.63.362,280n2113s1.03.991.233.241,749r2842h.851.231.123.21,574ivs25+9ac.481.011.73.191,553v3079i.82.3803.191,545ivs11-20ta.83.232.523.111,288r2888c1.111.35.643.1

In [41]:
ft_wordidx = {w:i for i,w in enumerate(ft_words)}
ft_vocab_size, len(ft_wordidx)

(362933, 362933)

In [42]:
len(set(vocab_words) - set(ft_words))

1677

In [43]:
set(vocab_words) - set(ft_words)

{'lrp4',
 'dfnb59',
 'a75p',
 'g1803a',
 'e1682v',
 'atp7b',
 'fxn',
 'r561c',
 't417_d419delinsi',
 'mocs1',
 'r1726g',
 'scn4a',
 'pdha1',
 'c420g',
 'g2420c',
 'prkn',
 'dpys',
 'e1051k',
 'v384d',
 'prpf31',
 'plod2',
 'h1805p',
 's860l',
 'g776delinslc',
 's241y',
 'washc5',
 'r112g',
 'siae',
 'r342w',
 'q2416*',
 'ctns',
 'rtn4r',
 'e685v',
 'q1811r',
 'a41s',
 'a500t',
 'm1663l',
 'gdi1',
 'gnmt',
 'd603g',
 'adgrg1',
 'mtmr14',
 'slc12a3',
 'r421*',
 'l37p',
 'e1356g',
 'h773dup',
 'g598a',
 's216f',
 'p2417a',
 'cfl2',
 'y599_d600inspapqimststlisenmnia',
 'x582_splice',
 'cngb3',
 'bbs4',
 'vangl2',
 'r505l',
 'bckdhb',
 'd737v',
 'i2675v',
 'arsb',
 'r331p',
 'i1718t',
 'a2351g',
 'l1844r',
 'r2336p',
 't244_i245inscpt',
 'r280g',
 'a72d',
 'grxcr1',
 'cldn14',
 'aptx',
 'phex',
 'slc19a3',
 'r100*',
 'hps3',
 'gjb1',
 's453fs*',
 'krt86',
 'atg16l1',
 'pafah1b1',
 'a60v',
 't1685a',
 'm1663k',
 'ftcd',
 'hip1-pdgfrb',
 'l747_a750del',
 'n480del',
 'a57v',
 'dock8',
 'e554_k

In [80]:
%autoreload
import global_utils
fasttext_vec_file="/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec"
wvs = global_utils.get_corpus_wvs_from_ft(fasttext_vec_file, 200, vocab_words)
wvs.shape

AssertionError: fast text some vectors doesn't match dimensions200 != 20

### saving all trained fast text vectors

In [99]:
%ll /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors

total 550348
-rwxrwxr-x 1 bicepjai 563552080 Sep 24 11:30 [0m[01;32mbiolab_updated_wvs.npy[0m*


In [103]:
len(vocab_words)

352220

In [104]:
%autoreload
import global_utils
ft_vector_files = [
                   (100,"ft_cbow_100d_20e"),(200,"ft_cbow_200d_20e"),(200,"ft_cbow_300d_20e"),
                   (100,"ft_sg_100d_20e"),(200,"ft_sg_200d_20e"),(200,"ft_sg_300d_20e"),
                   (100,"ft_cbow_100d_50e"),(200,"ft_cbow_200d_50e"),(200,"ft_cbow_300d_50e"),
                   (100,"ft_sg_100d_50e"),(200,"ft_sg_200d_50e"),(200,"ft_sg_300d_50e"),
                   (100,"ft_cbow_100d_100e"),(200,"ft_cbow_200d_100e"),(200,"ft_cbow_300d_100e"),
                   (100,"ft_sg_100d_100e"),(200,"ft_sg_200d_100e"),(200,"ft_sg_300d_100e")
                  ]

for dim_file_name in ft_vector_files:
    file_path = "/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/"+dim_file_name[1]+".vec"
    dim = dim_file_name[0]
    if not os.path.exists(file_path):
        print("file doesnt exist",file_path)
        continue
    ft_vec = global_utils.get_corpus_wvs_from_ft(file_path, dim, vocab_words)
    print(ft_vector_file,ft_vec.shape)
    np.save("processed/stage1/pretrained_word_vectors/"+dim_file_name[1]+".npy", ft_vec)

file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsot

(367260, 200)

# Viewing word vectors

In [9]:
%autoreload
import global_utils

In [14]:
WORD_EMB_SIZE=200
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape

(352220, 200)

In [16]:
tb_vocab_size=5000

In [17]:
tb_vocab_biolab = list(trained_embeddings)[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(corpus_vocab_list)

tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i,word in enumerate(tb_vocab_biolab):
    tb_word_vectors[i] = trained_embeddings[i]

In [22]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb")

(5000, 200)
